# 2024/2025
# Authors: Daniel Gutierrez & Fabio Pimentel
# Required Packages--------------------
# NOTE(review): rm(list = ls()) wipes the user's entire workspace when this
# script is sourced; starting from a fresh R session is the safer convention.
rm(list = ls())
library(readxl)    # read the *.xlsx input files
library(ggplot2)   # plotting
library(GGally)    # ggpairs() scatterplot matrices
library(dplyr)     # data-manipulation verbs
library(lubridate) # date helpers: floor_date(), ymd()
library(corrplot)  # correlation-matrix heatmaps
library(feasts)    # time-series feature tools
library(tsibble)   # tidy time-series objects
library(forecast)  # tsdisplay(), seasonplot(), ts models
library(tidyr)     # pivot_longer()
library(ggthemes)  # extra ggplot themes
library(car)       # regression diagnostics
library(DIMORA)    # diffusion models
library(tseries)   # stationarity tests
library(lmtest)    # coefficient tests
library(prophet)   # Prophet forecasting model
#setwd('/Users/fabiopimentel/Documents/Padua/clases/segundo año primer semestre/BEF data/proyecto/time_series_padova-main')
# target variable
# Daily restaurant sales; NA cells are treated as "no sale recorded".
sales <- read_excel("data/sales/sales_dimsum_31102024.xlsx")
sales[is.na(sales)] <- 0 # set to zero na values
# economic variables
eco_growth <- read_excel("data/macroeconomic/economic_activity.xlsx")
fx <- read_excel("data/macroeconomic/fx.xlsx") #Foreign exchange is the conversion of one currency into another
inflation <- read_excel("data/macroeconomic/inflation.xlsx")
unemployment <- read_excel("data/macroeconomic/unemployment.xlsx")
# other variables
google_trends <- read_excel("data/other/google_trends_restaurantes.xlsx")
rain <- read_excel("data/other/rain_proxy.xlsx")
temp <- read_excel("data/other/temperature_data.xlsx")
# NOTE(review): replacing NA with 0 fabricates 0-degree readings / 0 rain;
# the plots below are a visual check that no zeros actually appear in temp.
# Interpolation would be a safer fill strategy — TODO confirm.
temp[is.na(temp)] <- 0
rain[is.na(rain)] <- 0
plot(temp$tavg) # no zeros in temp : OK
plot(temp$tmedian) # no zeros in temp : OK- looks better than mean
# Inspect the structure of each loaded table.
# NOTE(review): the unprefixed "tibble [...]" / "$ ..." lines below are pasted
# console output from str(), not executable R — they will error if this file
# is sourced as-is; comment them out or delete them before running.
str(sales)
tibble [1,095 × 4] (S3: tbl_df/tbl/data.frame)
$ date : POSIXct[1:1095], format: "2021-11-01" ...
$ sales_cop: num [1:1095] 0 0 0 0 0 0 0 0 0 0 ...
$ bar : num [1:1095] 0 0 0 0 0 0 0 0 0 0 ...
$ food : num [1:1095] 0 0 0 0 0 0 0 0 0 0 ...
str(eco_growth)
tibble [238 × 2] (S3: tbl_df/tbl/data.frame)
$ date : POSIXct[1:238], format: "2005-01-01" ...
$ ise_original: num [1:238] 59.8 61.5 62.2 63 62.7 ...
str(fx) # Foreign exchange is the conversion of one currency into another
tibble [12,020 × 2] (S3: tbl_df/tbl/data.frame)
$ date: POSIXct[1:12020], format: "1991-11-27" ...
$ fx : num [1:12020] 693 694 695 695 643 ...
str(inflation)
tibble [832 × 2] (S3: tbl_df/tbl/data.frame)
$ date : POSIXct[1:832], format: "1955-07-01" ...
$ inflation: num [1:832] -0.0087 -0.0001 0.0084 0.0077 0.0144 0.0203 0.021 0.0257 0.031 0.0304 ...
str(unemployment)
tibble [286 × 2] (S3: tbl_df/tbl/data.frame)
$ date : POSIXct[1:286], format: "2001-01-01" ...
$ unemployment: num [1:286] 16.6 17.4 15.8 14.5 14 ...
str(google_trends)
tibble [262 × 2] (S3: tbl_df/tbl/data.frame)
$ date : POSIXct[1:262], format: "2019-11-10" ...
$ google_trends: num [1:262] 64 60 63 65 69 63 80 98 75 63 ...
str(rain)
tibble [56,648 × 6] (S3: tbl_df/tbl/data.frame)
$ date : POSIXct[1:56648], format: "2021-01-01" ...
$ region : chr [1:56648] "ANTIOQUIA" "ANTIOQUIA" "ANTIOQUIA" "ANTIOQUIA" ...
$ river : chr [1:56648] "A. SAN LORENZO" "CARLOS LLERAS" "CONCEPCION" "DESV. EEPPM (NEC,PAJ,DOL)" ...
$ contribution_m3s : num [1:56648] 19.29 20.31 3.61 7.48 19.5 ...
$ contrribution_kWh: num [1:56648] 4354800 3143300 1011700 2096000 899700 ...
$ contribution_pct : num [1:56648] 0.798 0.79 0.712 0.944 0.638 ...
str(temp) # this has NaNs, must fill somehow
tibble [1,113 × 4] (S3: tbl_df/tbl/data.frame)
$ date : POSIXct[1:1113], format: "2021-10-31" ...
$ tavg : num [1:1113] 19.8 18.3 18.4 18.3 18.6 19.8 19.9 19.7 18.8 18.7 ...
$ prcp : num [1:1113] 0 0 0 0 0 0 0 0 0 0 ...
$ tmedian: num [1:1113] 19.8 18.3 18.4 18.3 18.6 19.8 19.9 19.7 18.8 18.7 ...
# create time variables
plot(sales$sales_cop)
# sales
## Aggregate the daily sales to calendar months: total revenue, bar and food.
## group_by(month = ...) creates the grouping column in one step.
df_sales_m <- sales %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(sales_m = sum(sales_cop), bar_m = sum(bar), food_m = sum(food))
head(df_sales_m)
## Same aggregation at weekly granularity.
df_sales_w <- sales %>%
  group_by(week = floor_date(date, "week")) %>%
  summarise(sales_w = sum(sales_cop), bar_w = sum(bar), food_w = sum(food))
head(df_sales_w)
# fx
# Average the daily COP/USD rate within each month and each week
# (averaging, not summing, because fx is a level series).
df_fx_m <- fx %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(fx_m = mean(fx))
df_fx_w <- fx %>%
  group_by(week = floor_date(date, "week")) %>%
  summarise(fx_w = mean(fx))
head(df_fx_m)
head(df_fx_w)
# google trends
# The Google Trends index is averaged (not summed) within each period.
df_google_m <- google_trends %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(google_m = mean(google_trends))
df_google_w <- google_trends %>%
  group_by(week = floor_date(date, "week")) %>%
  summarise(google_w = mean(google_trends))
head(df_google_m)
head(df_google_w)
## rain
# Rain proxy: total river contribution (m3/s) per day and region.
# NOTE(review): `=` assignment here vs `<-` elsewhere; kept as-is.
df_rain_g = rain %>%
group_by(date, region) %>%
summarise(rain_sum=sum(contribution_m3s))
# NOTE(review): the next line is pasted console output, not code.
`summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
# Keep only the ANTIOQUIA region (the restaurant's department).
df_rain_g <- df_rain_g[df_rain_g$region=="ANTIOQUIA",]
head(df_rain_g)
# monthly totals of the daily rain proxy
df_rain_m <- df_rain_g %>%
mutate(month = floor_date(date, "month")) %>%
group_by(month) %>%
summarise(rain_m = sum(rain_sum))
# weekly totals
df_rain_w <- df_rain_g %>%
mutate(week = floor_date(date, "week")) %>%
group_by(week) %>%
summarise(rain_w = sum(rain_sum))
head(df_rain_m)
head(df_rain_w)
# temperature
# Monthly and weekly summaries from the station data:
# mean temperature, total precipitation.
df_temp_m <- temp %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(temp_m = mean(tavg), prcp_m = sum(prcp))
df_temp_w <- temp %>%
  group_by(week = floor_date(date, "week")) %>%
  summarise(temp_w = mean(tavg), prcp_w = sum(prcp))
head(df_temp_m)
head(df_temp_w)
## daily data----------
#sales, rain, fx are the only ones daily
# Inner joins: keep only dates present in every source table.
# NOTE(review): prefer all = FALSE over the abbreviation all=F (F is reassignable).
df_merged_d <- merge(sales, df_rain_g, by = "date", all = FALSE) # Inner join
df_merged_d <- merge(df_merged_d, fx, by = "date", all = FALSE) # Inner join
df_merged_d <- merge(df_merged_d, temp, by = "date", all = FALSE) # Inner join
head(df_merged_d)
# NOTE(review): the stray "NA" below is pasted console output, not code.
NA
### weekly data----------
df_merged_w <- merge(df_sales_w, df_rain_w, by="week", all=F)
df_merged_w <- merge(df_merged_w, df_google_w, by="week", all=F)
df_merged_w <- merge(df_merged_w, df_fx_w, by="week", all=F)
df_merged_w <- merge(df_merged_w, df_temp_w, by="week", all=F)
head(df_merged_w)
### monthly data----------
# change colnames
# Rename the date columns to "month" so every monthly table shares the merge key.
names(eco_growth) <- c("month", "ise")
names(inflation) <- c("month", "inflation")
names(unemployment) <- c("month", "unemployment")
# Each nrow() call is a sanity check that the inner joins keep all 36 months.
# NOTE(review): the "[1] 36" lines are pasted console output, not code.
df_merged_m <- merge(df_sales_m, df_rain_m, by="month", all=F)
nrow(df_merged_m)
[1] 36
df_merged_m <- merge(df_merged_m, df_fx_m, by="month", all=F)
nrow(df_merged_m)
[1] 36
df_merged_m <- merge(df_merged_m, df_google_m, by="month", all=F)
nrow(df_merged_m)
[1] 36
df_merged_m <- merge(df_merged_m, eco_growth, by="month", all=F) # only has until aug 2024
nrow(df_merged_m)
[1] 36
df_merged_m <- merge(df_merged_m, inflation, by="month", all=F)
nrow(df_merged_m)
[1] 36
df_merged_m <- merge(df_merged_m, unemployment, by="month", all=F)
nrow(df_merged_m)
[1] 36
df_merged_m <- merge(df_merged_m, df_temp_m, by="month", all=F)
nrow(df_merged_m)
[1] 36
# Keep only the three merged data frames; drop the intermediates.
objects_to_keep <- c("df_merged_d", "df_merged_w", "df_merged_m")
# Remove all objects except those specified
rm(list = setdiff(ls(), objects_to_keep))
# Line plots of the target variable at the three granularities.
ggplot(df_merged_d, aes(x = date, y = sales_cop)) +
  geom_line() +
  ggtitle("Daily Sales of Restaurant")
ggplot(df_merged_w, aes(x = week, y = sales_w)) +
  geom_line() +
  ggtitle("Weekly Sales of Restaurant")
ggplot(df_merged_m, aes(x = month, y = sales_m)) +
  geom_line() +
  ggtitle("Monthly Sales of Restaurant")
# We want to move to a stacked bar chart when we care about the relative decomposition of each primary bar based on the levels of a second categorical variable. Each bar is now comprised of a number of sub-bars, each one corresponding with a level of a secondary categorical variable. The total length of each stacked bar is the same as before, but now we can see how the secondary groups contributed to that total.
# One important consideration in building a stacked bar chart is deciding which of the two categorical variables will be the primary variable (dictating major axis positions and overall bar lengths) and which will be the secondary (dictating how each primary bar will be subdivided). The most 'important' variable should be the primary; use domain knowledge and the specific type of categorical variables to decide how to assign them.
# Monthly
# Long format: one row per (month, category) so bar/food can be stacked.
df_sales_m_long <- df_merged_m %>%
  pivot_longer(cols = c(bar_m, food_m), names_to = "Category", values_to = "Value")
# geom_col() is shorthand for geom_bar(stat = "identity"); stacking is kept
# explicit with position = "stack".
ggplot(df_sales_m_long, aes(x = month, y = Value, fill = Category)) +
  geom_col(position = "stack") +
  ggtitle("Monthly Sales of Restaurant") +
  labs(y = "Sales", x = "Month", fill = "Category") +
  theme_minimal()
# Weekly
# Same reshape and stacked bars at weekly granularity.
df_sales_w_long <- df_merged_w %>%
  pivot_longer(cols = c(bar_w, food_w), names_to = "Category", values_to = "Value")
ggplot(df_sales_w_long, aes(x = week, y = Value, fill = Category)) +
  geom_col(position = "stack") +
  ggtitle("Weekly Sales of Restaurant") +
  labs(y = "Sales", x = "Week", fill = "Category") +
  theme_minimal()
# Seasonal plots
# Drop the partial 2021 weeks so the series starts at the first full year.
df_sales_w_filtered <- df_merged_w %>%
filter(week >= ymd("2021-12-31"))
# frequency = 52: weekly observations, one seasonal cycle per year.
tseries_w <- ts(df_sales_w_filtered$sales_w , start = c(2022, 1), frequency = 52)
head(tseries_w)
# NOTE(review): the "Time Series:" block below is pasted console output, not code.
Time Series:
Start = c(2022, 1)
End = c(2022, 6)
Frequency = 52
[1] 5654618 5894371 8308052 9164178 11934123 9777570
seasonplot(tseries_w, col = rainbow(3), year.labels = TRUE, main = "Seasonal Plot")
text(x = 1, y = max(tseries_w) - 1.5e7, labels = "2024", col = "blue")
# NOTE(review): the two "NA" lines below are pasted console output, not code.
NA
NA
# seasonplot monthly
df_sales_m_filtered <- df_merged_m %>%
filter(month >= ymd("2021-12-31"))
# frequency = 12: monthly observations, one seasonal cycle per year.
tseries_m <- ts(df_sales_m_filtered$sales_m , start = c(2022, 1), frequency = 12)
head(tseries_m)
# NOTE(review): the two lines below are pasted console output, not code.
Jan Feb Mar Apr May Jun
2022 31953282 41926179 37466677 57926246 79622843 88816435
seasonplot(tseries_m, col = rainbow(3), year.labels = TRUE, main = "Seasonal Plot")
text(x = 1, y = max(tseries_m) - 1e6, labels = "2024", col = "blue")
# Monthly Density
# Select the columns of interest
variables <- c("sales_m", "bar_m", "food_m", "rain_m", "fx_m", "google_m",
"ise", "inflation", "unemployment", "temp_m", "prcp_m")
# Transform the data to long format for ggplot2 (one row per variable/value)
df_long_m <- df_merged_m %>%
pivot_longer(cols = all_of(variables), names_to = "Variable", values_to = "Value")
# Create the grid of density plots; scales = "free" because the variables
# live on very different scales.
ggplot(df_long_m, aes(x = Value)) +
geom_density(fill = "blue", alpha = 0.4) +
facet_wrap(~ Variable, scales = "free", ncol = 3) +
labs(title = "Density Plots of Selected Variables",
x = "Value", y = "Density") +
theme_minimal()
# Weekly Density
# Select the columns of interest (no macro series at weekly granularity)
variables <- c("sales_w", "bar_w", "food_w", "rain_w", "fx_w", "google_w",
"temp_w", "prcp_w")
df_long_w <- df_merged_w %>%
pivot_longer(cols = all_of(variables), names_to = "Variable", values_to = "Value")
# Create the grid of density plots
ggplot(df_long_w, aes(x = Value)) +
geom_density(fill = "blue", alpha = 0.4) +
facet_wrap(~ Variable, scales = "free", ncol = 3) +
labs(title = "Density Plots of Selected Variables",
x = "Value", y = "Density") +
theme_minimal()
# Daily Density
# Select the columns of interest
variables <- c("sales_cop", "bar", "food", "rain_sum", "fx",
"tmedian", "prcp")
df_long_d <- df_merged_d %>%
pivot_longer(cols = all_of(variables), names_to = "Variable", values_to = "Value")
# Create the grid of density plots
ggplot(df_long_d, aes(x = Value)) +
geom_density(fill = "blue", alpha = 0.4) +
facet_wrap(~ Variable, scales = "free", ncol = 3) +
labs(title = "Density Plots of Selected Variables",
x = "Value", y = "Density") +
theme_minimal()
### 3.5.1 economic variables-----------------------
# economic growth
ggplot(df_merged_m, aes(x=month, y=ise)) +
geom_line() + ggtitle("Monthly activity in Colombia")
# clearly seasonal and trend
# fx
ggplot(df_merged_d, aes(x=date, y=fx)) +
geom_line() + ggtitle("Daily COP/USD")
# trend but no clear seasonality
# inflation
ggplot(df_merged_m, aes(x=month, y=inflation)) +
geom_line() + ggtitle("Monthly inflation National")
# business cycles, no trend or seasonality
# unemployment
ggplot(df_merged_m, aes(x=month, y=unemployment)) +
geom_line() + ggtitle("Montly trailing unemployment Medellin")
# seasonal and trend downwards
### 3.5.2 Other variables
# google trends
ggplot(df_merged_w, aes(x=week, y=google_w)) +
geom_line() + ggtitle("Weelkly Google trends 'Restaurantes'")
# no clear behaviour, drop in pandemic
# rain
ggplot(df_merged_d, aes(x=date, y=rain_sum)) +
geom_line() + ggtitle("Daily rain approximated in Antioquia")
# no trend or seasonality clearly
# temperature
ggplot(df_merged_d, aes(x=date, y=tmedian)) +
geom_line() + ggtitle("Daily Median temperature in Medellin")
# almost stationary
# temperature
ggplot(df_merged_d, aes(x=date, y=tavg)) +
geom_line() + ggtitle("Daily Average temperature in Medellin")
# this one looks weird, better keep working on median
# precipitation from temp
ggplot(df_merged_d, aes(x=date, y=prcp)) +
geom_line() + ggtitle("Daily precipitation in Medellin")
# looks decent
# Drop the constant region column ("ANTIOQUIA") before the pair plots.
df_merged_d <- subset(df_merged_d, select = -region)
# daily pairwise scatter/correlation matrix (columns 2:8 = all numerics)
ggpairs(df_merged_d,
columns = 2:8)
# sales have correl with fx and rain_sum
# weekly
ggpairs(df_merged_w,
columns = 2:9)
# sales have correl with rain, google, fx, temp
# bar has more correl with temp
# monthly
ggpairs(df_merged_m,
columns = 2:12)
# Exclude 'date' column
numeric_df_d <- df_merged_d[, sapply(df_merged_d, is.numeric)]
cor_matrix_d <- cor(numeric_df_d, use = "complete.obs") # Use only complete rows
cor_matrix_d
sales_cop bar food rain_sum
sales_cop 1.000000000 0.895118281 0.98135791 0.1662524
bar 0.895118281 1.000000000 0.79301917 0.1275670
food 0.981357913 0.793019172 1.00000000 0.1716482
rain_sum 0.166252448 0.127567019 0.17164822 1.0000000
fx 0.146774104 0.136714885 0.14261504 0.6350092
tavg 0.008884659 0.089640797 -0.02562481 -0.1340051
prcp 0.028085504 -0.001337552 0.03913167 0.1857676
tmedian 0.108891610 0.111686146 0.10120437 -0.2717930
fx tavg prcp tmedian
sales_cop 0.14677410 0.008884659 0.028085504 0.10889161
bar 0.13671488 0.089640797 -0.001337552 0.11168615
food 0.14261504 -0.025624811 0.039131672 0.10120437
rain_sum 0.63500919 -0.134005147 0.185767588 -0.27179302
fx 1.00000000 0.164550220 0.055565287 -0.07710937
tavg 0.16455022 1.000000000 -0.267367632 0.65617098
prcp 0.05556529 -0.267367632 1.000000000 -0.22103872
tmedian -0.07710937 0.656170981 -0.221038724 1.00000000
numeric_df_w <- df_merged_w[, sapply(df_merged_w, is.numeric)]
cor_matrix_w <- cor(numeric_df_w, use = "complete.obs") # Use only complete rows
cor_matrix_w
sales_w bar_w food_w rain_w
sales_w 1.00000000 0.8910033 0.98922211 0.2913161
bar_w 0.89100331 1.0000000 0.81506933 0.2875652
food_w 0.98922211 0.8150693 1.00000000 0.2785842
rain_w 0.29131607 0.2875652 0.27858415 1.0000000
google_w -0.44467552 -0.3551772 -0.45333238 -0.1073483
fx_w 0.24395629 0.2873958 0.22038752 0.6626647
temp_w -0.02746535 0.1275798 -0.07507291 -0.1062959
prcp_w 0.17241188 0.1211778 0.18095801 0.3289820
google_w fx_w temp_w prcp_w
sales_w -0.444675519 0.243956293 -0.02746535 0.1724119
bar_w -0.355177208 0.287395824 0.12757976 0.1211778
food_w -0.453332380 0.220387520 -0.07507291 0.1809580
rain_w -0.107348280 0.662664730 -0.10629590 0.3289820
google_w 1.000000000 -0.002777267 0.20849760 -0.2516305
fx_w -0.002777267 1.000000000 0.19315256 0.1078585
temp_w 0.208497604 0.193152564 1.00000000 -0.3047876
prcp_w -0.251630474 0.107858510 -0.30478758 1.0000000
numeric_df_m <- df_merged_m[, sapply(df_merged_m, is.numeric)]
cor_matrix_m <- cor(numeric_df_m, use = "complete.obs") # Use only complete rows
cor_matrix_m
sales_m bar_m food_m rain_m
sales_m 1.00000000 0.91772972 0.9930535 0.345901419
bar_m 0.91772972 1.00000000 0.8646613 0.357818748
food_m 0.99305351 0.86466133 1.0000000 0.331685570
rain_m 0.34590142 0.35781875 0.3316856 1.000000000
fx_m 0.26008119 0.33043881 0.2325461 0.700900212
google_m -0.60141163 -0.43583265 -0.6306910 -0.127328581
ise 0.14184921 0.09197376 0.1528346 0.143925313
inflation 0.24620155 0.41865460 0.1885633 0.554477494
unemployment -0.47774546 -0.35355440 -0.4999277 -0.314673431
temp_m -0.05360312 0.12366320 -0.1041492 -0.004770513
prcp_m 0.28057040 0.20114717 0.2956418 0.364914544
fx_m google_m ise inflation
sales_m 0.260081195 -0.60141163 0.141849210 0.2462015
bar_m 0.330438807 -0.43583265 0.091973761 0.4186546
food_m 0.232546131 -0.63069104 0.152834568 0.1885633
rain_m 0.700900212 -0.12732858 0.143925313 0.5544775
fx_m 1.000000000 -0.01630270 0.006203153 0.7315857
google_m -0.016302700 1.00000000 0.093304022 0.1600715
ise 0.006203153 0.09330402 1.000000000 -0.1118127
inflation 0.731585721 0.16007149 -0.111812691 1.0000000
unemployment -0.148857922 0.42550422 -0.602525241 -0.1308017
temp_m 0.242855832 0.33976804 -0.190530186 0.5982503
prcp_m 0.106392263 -0.37823361 0.047259121 0.0657641
unemployment temp_m prcp_m
sales_m -0.477745462 -0.053603118 0.28057040
bar_m -0.353554400 0.123663200 0.20114717
food_m -0.499927664 -0.104149167 0.29564181
rain_m -0.314673431 -0.004770513 0.36491454
fx_m -0.148857922 0.242855832 0.10639226
google_m 0.425504221 0.339768042 -0.37823361
ise -0.602525241 -0.190530186 0.04725912
inflation -0.130801728 0.598250278 0.06576410
unemployment 1.000000000 -0.007286939 -0.41599220
temp_m -0.007286939 1.000000000 -0.28014446
prcp_m -0.415992203 -0.280144464 1.00000000
# Plot the Correlation Matrix
par(mfrow=c(1,1))
# Upper-triangle heatmaps of the daily/weekly/monthly correlation matrices.
corrplot(cor_matrix_d, method = "color", type = "upper", tl.col = "black", tl.srt = 45)
corrplot(cor_matrix_w, method = "color", type = "upper", tl.col = "black", tl.srt = 45)
corrplot(cor_matrix_m, method = "color", type = "upper", tl.col = "black", tl.srt = 45)
# Rain has a stronger correlation than prcp, so we drop prcp to avoid repeating the same variable from two sources. We also drop average temperature because median temperature seems more trustworthy.
# drop prcp because rain_sum and prcp "are the same" phenomenon measured twice
df_merged_m <- df_merged_m %>% select(-prcp_m)
df_merged_w <- df_merged_w %>% select(-prcp_w)
df_merged_d <- df_merged_d %>% select(-prcp)
# drop avg temp (tmedian is kept as the more trustworthy measure)
df_merged_d <- df_merged_d %>% select(-tavg)
colnames(df_merged_d)
# NOTE(review): the two bracketed lines below are pasted console output, not code.
[1] "date"      "sales_cop" "bar"       "food"      "rain_sum"
[6] "fx"        "tmedian"
### drop everything not on use
objects_to_keep <- c("df_merged_d", "df_merged_w", "df_merged_m")
# Remove all objects except those specified
rm(list = setdiff(ls(), objects_to_keep))
# POSIXct and POSIXlt Classes
# Times and date-times are represented by the POSIXct or the POSIXlt class in R. The POSIXct format stores date and time as the number of seconds since January 1, 1970, so a POSIXct date-time is essentially a single value on a timeline. Date-times prior to 1970 are negative numbers. The POSIXlt class stores date and time information in a list: hour, day of week, month of year, etc. The starting year for POSIXlt data is 1900, so 2022 is stored as year 122. Months also begin at 0, so January is stored as month 0 and February as month 1. For both POSIX classes the timezone can be specified. While date-times stored as POSIXct and POSIXlt look similar, unclassing them with unclass() reveals the additional information stored within POSIXlt data.
# Date Class
# Dates without time can simply be stored as a Date class in R using the as.Date() function. Both Date and POSIX classes need to be defined based on how they are formatted. When loading time series data into R, date and date-time data is typically read as a character class and must be converted using as.Date(), as.POSIXct() or as.POSIXlt().
# Monthly
# Vars for model
# Month: build a linear trend index (numeric_month) and a month-name factor
# (seasonal_month) that will supply the seasonal dummies.
# NOTE: format(month, "%B") is locale-dependent; month.name assumes English.
df_merged_m$month <- as.POSIXct(df_merged_m$month)
df_merged_m <- df_merged_m %>%
  arrange(month) %>%
  mutate(
    numeric_month = row_number(),
    seasonal_month = factor(format(month, "%B"), levels = month.name)
  )
# Weekly
# Week: linear trend index by week plus the month-of-year factor used as the
# seasonal variable at weekly granularity.
df_merged_w$week <- as.POSIXct(df_merged_w$week)
df_merged_w <- df_merged_w %>%
  arrange(week) %>%
  mutate(
    numeric_week = row_number(),
    seasonal_month = factor(format(week, "%B"), levels = month.name)
  )
# Daily
# Day: linear trend index by day, month-of-year factor, and day-of-week factor.
# NOTE: weekdays() is locale-dependent; the English level names assume an
# English locale.
df_merged_d$date <- as.POSIXct(df_merged_d$date)
df_merged_d <- df_merged_d %>%
  arrange(date) %>%
  mutate(
    numeric_day = row_number(),
    seasonal_month = factor(format(date, "%B"), levels = month.name),
    day_of_week = factor(
      weekdays(date),
      levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                 "Saturday", "Sunday")
    )
  )
# Convert sales to time series objects for use in several models.
# convert to time series
# ts() without a frequency argument: a plain numeric series indexed by
# observation number (no seasonal period attached).
sales_d_ts <- ts(df_merged_d$sales_cop)
sales_w_ts <- ts(df_merged_w$sales_w)
sales_m_ts <- ts(df_merged_m$sales_m)
par(mfrow=c(1,1))
# Daily
tsdisplay(sales_d_ts)
# is not stationary but has no clear trend
# and seasonality every 7 days
# Weekly
tsdisplay(sales_w_ts)
# not stationary: has trend
# Monthly
tsdisplay(sales_m_ts)
# has clear trend, no seasonality
# Some variables are scaled to log so we can interpret the linear models more easily. The covariates are on different scales, so it is easier to interpret percentage changes instead of unit changes.
# Log transforms: log(x + 1) on numeric columns so coefficients read as
# approximate percentage effects. At monthly level, rates already expressed
# in percent (unemployment, inflation) are left untouched.
# NOTE(review): across(where(is.numeric)) also log-transforms the trend
# indexes (numeric_month/week/day), turning the linear trend into a log
# trend in the models below — confirm this is intended.
# Monthly
df_merged_m <- df_merged_m %>%
mutate(across(where(is.numeric) & !all_of(c("unemployment", "inflation")), ~ log(. + 1)))
# Weekly
df_merged_w <- df_merged_w %>%
mutate(across(where(is.numeric), ~ log(. + 1)))
# Daily
df_merged_d <- df_merged_d %>%
mutate(across(where(is.numeric), ~ log(. + 1)))
# Daily diagnostics (sales_d_ts was built from the pre-log sales above).
#par(mfrow=c(1,1))
#tsdisplay(sales_d_ts)
# is not stationary but has no clear trend
plot(sales_d_ts)
acf(sales_d_ts)
pacf(sales_d_ts)
# When data are seasonal, the autocorrelation will be larger for the seasonal lags (at multiples of the seasonal period) than for other lags.
# Weekly
#tsdisplay(sales_w_ts)
plot(sales_w_ts)
acf(sales_w_ts)
pacf(sales_w_ts)
# not stationary: has trend and seasonality maybe
# Monthly
#tsdisplay(sales_m_ts)
plot(sales_m_ts)
acf(sales_m_ts)
pacf(sales_m_ts)
# has clear trend, no seasonality
# In this section we model the time series using various approaches to find the best model for our data. We use both linear and non-linear models, going from the simplest to the more "complex" models.
# Helper functions that let us implement and analyze models faster.
## Function to create and summarize models------------------
# Fit a linear model, print its summary, and draw the four standard lm
# diagnostic plots in a 2x2 grid.
#
# formula:    model formula passed to lm()
# data:       data.frame containing the variables in `formula`
# model_name: label printed before the summary
# Returns the fitted lm object.
run_model <- function(formula, data, model_name) {
  cat("\nRunning", model_name, "\n")
  model <- lm(formula, data = data)
  print(summary(model))
  # Fix: restore the caller's plotting layout on exit — the original left
  # par(mfrow = c(2, 2)) active, silently reshaping every later base plot.
  old_par <- par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)
  plot(model)
  return(model)
}
# Function to compare models using ANOVA
# Print a header, then print and return the ANOVA table comparing the two
# (nested) fitted models.
compare_models <- function(model1, model2, name1, name2) {
  cat("\nComparing Models:", name1, "vs", name2, "\n")
  comparison <- anova(model1, model2)
  print(comparison)
  comparison
}
# Function to add predictions to the dataset
# Attach the model's predictions for `data` as a new column named
# `pred_column`, returning the augmented data frame.
add_predictions <- function(model, data, pred_column) {
  preds <- predict(model, newdata = data)
  data[[pred_column]] <- preds
  data
}
# Calculate RMSE
# Root mean squared error between observed and predicted values;
# pairs containing NA are dropped before averaging (na.rm = TRUE).
calculate_rmse <- function(observed, predicted) {
  errors <- observed - predicted
  sqrt(mean(errors^2, na.rm = TRUE))
}
# function that compares linear models
# Collect fit statistics for a named list of fitted lm models.
# models: named list of lm objects.
# Returns a data.frame with one row per model: Model (name), R2, AIC.
get_model_stats <- function(models) {
  # Edge case: empty input returns the empty frame with the expected columns.
  if (length(models) == 0) {
    return(data.frame(Model = character(), R2 = numeric(), AIC = numeric(),
                      stringsAsFactors = FALSE))
  }
  # Build one single-row frame per model, then bind once. This avoids the
  # original rbind()-inside-a-loop pattern, which copies the accumulated
  # data frame on every iteration (O(n^2)).
  rows <- lapply(seq_along(models), function(i) {
    model <- models[[i]]
    data.frame(
      Model = names(models)[i],
      R2 = summary(model)$r.squared,
      AIC = AIC(model),
      stringsAsFactors = FALSE
    )
  })
  do.call(rbind, rows)
}
# Montly Models
# View Dataframe
head(df_merged_m)
# Model 0: Trend only
ols0 <- run_model(sales_m ~ numeric_month, df_merged_m, "Model 0")
Running Model 0
Call:
lm(formula = formula, data = data)
Residuals:
Min 1Q Median 3Q Max
-0.9910 -0.1192 0.0158 0.1476 0.4953
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.40311 0.18031 90.97 < 2e-16 ***
numeric_month 0.68309 0.06312 10.82 1.49e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.28 on 34 degrees of freedom
Multiple R-squared: 0.775, Adjusted R-squared: 0.7684
F-statistic: 117.1 on 1 and 34 DF, p-value: 1.485e-12
df_merged_m <- add_predictions(ols0, df_merged_m, "predicted_sales0")
# Model 1: Trend + Seasonality
ols1 <- run_model(sales_m ~ numeric_month + seasonal_month, df_merged_m, "Model 1")
Running Model 1
Call:
lm(formula = formula, data = data)
Residuals:
Min 1Q Median 3Q Max
-0.7403 -0.1590 -0.0135 0.2071 0.4347
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.505470 0.260998 63.240 < 2e-16
numeric_month 0.657412 0.075865 8.666 1.06e-08
seasonal_monthFebruary -0.001699 0.254032 -0.007 0.995
seasonal_monthMarch -0.006230 0.254345 -0.024 0.981
seasonal_monthApril -0.063031 0.254777 -0.247 0.807
seasonal_monthMay 0.070611 0.255288 0.277 0.785
seasonal_monthJune 0.037438 0.255855 0.146 0.885
seasonal_monthJuly 0.138742 0.256462 0.541 0.594
seasonal_monthAugust -0.010812 0.257098 -0.042 0.967
seasonal_monthSeptember -0.133330 0.257755 -0.517 0.610
seasonal_monthOctober -0.058397 0.258427 -0.226 0.823
seasonal_monthNovember -0.335279 0.254924 -1.315 0.201
seasonal_monthDecember -0.016084 0.254094 -0.063 0.950
(Intercept) ***
numeric_month ***
seasonal_monthFebruary
seasonal_monthMarch
seasonal_monthApril
seasonal_monthMay
seasonal_monthJune
seasonal_monthJuly
seasonal_monthAugust
seasonal_monthSeptember
seasonal_monthOctober
seasonal_monthNovember
seasonal_monthDecember
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.311 on 23 degrees of freedom
Multiple R-squared: 0.8123, Adjusted R-squared: 0.7144
F-statistic: 8.296 on 12 and 23 DF, p-value: 8.937e-06
df_merged_m <- add_predictions(ols1, df_merged_m, "predicted_sales1")
## Model 2: Backward Stepwise Regression
# Start with the full model. food and bar are excluded — they appear to be
# components of total sales (their correlations with sales_m are ~0.99/0.92),
# so including them would leak the target into the regressors.
ols2_full <- lm(
sales_m ~ numeric_month + seasonal_month + unemployment + ise + fx_m +
google_m + temp_m + rain_m,
data = df_merged_m
)
# Perform backward stepwise regression (AIC-based elimination of covariates)
ols2_stepwise <- step(
ols2_full,
direction = "backward",
trace = 1 # Prints the stepwise regression process
)
Start: AIC=-104.31
sales_m ~ numeric_month + seasonal_month + unemployment + ise +
fx_m + google_m + temp_m + rain_m
Df Sum of Sq RSS AIC
- temp_m 1 0.00008 0.69105 -106.310
- unemployment 1 0.01523 0.70620 -105.530
<none> 0.69097 -104.314
- rain_m 1 0.07578 0.76674 -102.568
- seasonal_month 11 0.71692 1.40789 -100.691
- ise 1 0.17179 0.86276 -98.321
- fx_m 1 0.39083 1.08180 -90.176
- google_m 1 0.57575 1.26672 -84.495
- numeric_month 1 2.29957 2.99054 -53.570
Step: AIC=-106.31
sales_m ~ numeric_month + seasonal_month + unemployment + ise +
fx_m + google_m + rain_m
Df Sum of Sq RSS AIC
- unemployment 1 0.02314 0.71419 -107.125
<none> 0.69105 -106.310
- rain_m 1 0.07693 0.76798 -104.510
- seasonal_month 11 0.79109 1.48214 -100.841
- ise 1 0.17775 0.86880 -100.070
- fx_m 1 0.43032 1.12137 -90.883
- google_m 1 0.65543 1.34648 -84.297
- numeric_month 1 2.42508 3.11613 -54.089
Step: AIC=-107.12
sales_m ~ numeric_month + seasonal_month + ise + fx_m + google_m +
rain_m
Df Sum of Sq RSS AIC
<none> 0.7142 -107.125
- rain_m 1 0.0940 0.8082 -104.673
- seasonal_month 11 0.7847 1.4989 -102.437
- ise 1 0.1606 0.8748 -101.823
- fx_m 1 0.4343 1.1485 -92.022
- google_m 1 0.6466 1.3608 -85.916
- numeric_month 1 4.1118 4.8260 -40.342
# Summary of the final stepwise model
summary(ols2_stepwise)
Call:
lm(formula = sales_m ~ numeric_month + seasonal_month + ise +
fx_m + google_m + rain_m, data = df_merged_m)
Residuals:
Min 1Q Median 3Q Max
-0.31229 -0.08760 -0.01030 0.09029 0.31789
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -52.31160 19.99397 -2.616 0.016981
numeric_month 0.93626 0.08952 10.459 2.54e-09
seasonal_monthFebruary 0.27615 0.19812 1.394 0.179459
seasonal_monthMarch -0.33136 0.28825 -1.150 0.264583
seasonal_monthApril -0.03032 0.20910 -0.145 0.886232
seasonal_monthMay -0.12629 0.27633 -0.457 0.652850
seasonal_monthJune -0.16788 0.27401 -0.613 0.547361
seasonal_monthJuly -0.11655 0.30708 -0.380 0.708483
seasonal_monthAugust -0.45680 0.33544 -1.362 0.189189
seasonal_monthSeptember -0.40348 0.27930 -1.445 0.164861
seasonal_monthOctober -0.21625 0.28064 -0.771 0.450439
seasonal_monthNovember -0.67606 0.38426 -1.759 0.094603
seasonal_monthDecember -1.52898 0.67026 -2.281 0.034247
ise 7.20920 3.48786 2.067 0.052641
fx_m 2.72323 0.80115 3.399 0.003010
google_m 3.17853 0.76636 4.148 0.000547
rain_m -0.20297 0.12835 -1.581 0.130285
(Intercept) *
numeric_month ***
seasonal_monthFebruary
seasonal_monthMarch
seasonal_monthApril
seasonal_monthMay
seasonal_monthJune
seasonal_monthJuly
seasonal_monthAugust
seasonal_monthSeptember
seasonal_monthOctober
seasonal_monthNovember .
seasonal_monthDecember *
ise .
fx_m **
google_m ***
rain_m
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1939 on 19 degrees of freedom
Multiple R-squared: 0.9397, Adjusted R-squared: 0.889
F-statistic: 18.52 on 16 and 19 DF, p-value: 2.609e-08
# Add predictions from the final stepwise model
df_merged_m <- add_predictions(ols2_stepwise, df_merged_m, "predicted_sales2")
# Plot Actual vs Predicted Values. exp() undoes the earlier log transform.
# NOTE(review): the transform was log(x + 1), so exp() recovers x + 1, not x;
# expm1() would be exact (difference of 1 COP is negligible here).
# NOTE(review): size = for lines is deprecated in ggplot2 >= 3.4 in favor of
# linewidth — confirm the installed ggplot2 version.
ggplot(df_merged_m, aes(x = month)) +
geom_line(aes(y = exp(sales_m), color = "Actual Sales"), size = 1) +
geom_line(aes(y = exp(predicted_sales0), color = "Model 0"), linetype = "dashed", size = 1) +
geom_line(aes(y = exp(predicted_sales1), color = "Model 1"), linetype = "dotted", size = 1) +
geom_line(aes(y = exp(predicted_sales2), color = "Model 2 Stepwise"), linetype = "dotdash", size = 1) +
labs(title = "Actual vs Predicted Monthly Sales for All Models",
x = "Month", y = "Sales", color = "Legend") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Models to compare
models <- list(
"Model trend" = ols0,
"Model trend + season" = ols1,
"Model all covariates step" = ols2_stepwise
)
# Get R^2 and AIC for each model
model_stats <- get_model_stats(models)
# View the results
print(model_stats)
# RMSE calculation for the original (exponentiated) scale
rmse_stats <- data.frame(
Model = character(),
RMSE = numeric(),
stringsAsFactors = FALSE
)
# Loop through each model; prediction columns are predicted_sales0/1/2.
for (i in seq_along(models)) {
model_name <- names(models)[i]
predicted_column <- paste0("predicted_sales", i - 1) # Adjust column name index
# Calculate RMSE on the original scale
rmse <- calculate_rmse(
observed = exp(df_merged_m$sales_m), # Exponentiate actual values
predicted = exp(df_merged_m[[predicted_column]]) # Exponentiate predicted values
)
# Append results to the RMSE stats table
rmse_stats <- rbind(rmse_stats, data.frame(Model = model_name, RMSE = rmse))
}
# View RMSE statistics
print(rmse_stats)
# NOTE(review): the stray "NA" below is pasted console output, not code.
NA
# Keep the stepwise model's RMSE (row 3) for later comparison across models.
rmse_ols_m <- rmse_stats$RMSE[3]
rmse_ols_m
[1] 13229295
# Weekly Models
head(df_merged_w)
## Clean Data - Drop rows 1-2 because sales are 0 / restaurant was not open yet
df_merged_w <- df_merged_w %>% slice(-1, -2)
## Model 0A: Trend only (weekly counterpart of monthly Model 0)
ols0w <- run_model(sales_w ~ numeric_week, df_merged_w, "Model 0A")
Running Model 0A
Call:
lm(formula = formula, data = data)
Residuals:
Min 1Q Median 3Q Max
-0.72559 -0.15917 0.01874 0.16674 0.65873
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.63582 0.10036 145.84 <2e-16 ***
numeric_week 0.53273 0.02372 22.46 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2406 on 153 degrees of freedom
Multiple R-squared: 0.7673, Adjusted R-squared: 0.7658
F-statistic: 504.5 on 1 and 153 DF, p-value: < 2.2e-16
df_merged_w <- add_predictions(ols0w, df_merged_w, "predicted_sales0")
## Model 1A: Trend + Seasonality
ols1w <- run_model(sales_w ~ numeric_week + seasonal_month, df_merged_w, "Model 1A")
Running Model 1A
Call:
lm(formula = formula, data = data)
Residuals:
Min 1Q Median 3Q Max
-0.62472 -0.14172 0.01215 0.16457 0.57707
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.67383 0.11499 127.613 <2e-16 ***
numeric_week 0.53053 0.02537 20.915 <2e-16 ***
seasonal_monthFebruary 0.01863 0.09432 0.197 0.844
seasonal_monthMarch -0.07386 0.09266 -0.797 0.427
seasonal_monthApril -0.04300 0.09278 -0.463 0.644
seasonal_monthMay 0.05120 0.09279 0.552 0.582
seasonal_monthJune 0.03179 0.09343 0.340 0.734
seasonal_monthJuly 0.09360 0.09164 1.021 0.309
seasonal_monthAugust -0.03924 0.09572 -0.410 0.683
seasonal_monthSeptember -0.12111 0.09425 -1.285 0.201
seasonal_monthOctober -0.08215 0.09245 -0.889 0.376
seasonal_monthNovember -0.05447 0.09654 -0.564 0.574
seasonal_monthDecember -0.13661 0.09221 -1.481 0.141
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2394 on 142 degrees of freedom
Multiple R-squared: 0.7862, Adjusted R-squared: 0.7681
F-statistic: 43.5 on 12 and 142 DF, p-value: < 2.2e-16
df_merged_w <- add_predictions(ols1w, df_merged_w, "predicted_sales1")
## Model 2A: Experimentation
# Start with the full model (excluding food and bar)
ols2_full_w <- lm(
sales_w ~ numeric_week + seasonal_month + fx_w +
google_w + temp_w + rain_w,
data = df_merged_w
)
# Perform backward stepwise regression
ols2_stepwise_w <- step(
ols2_full_w,
direction = "backward",
trace = 1 # Prints the stepwise regression process
)
Start: AIC=-470.72
sales_w ~ numeric_week + seasonal_month + fx_w + google_w + temp_w +
rain_w
Df Sum of Sq RSS AIC
- google_w 1 0.0047 5.9773 -472.59
- temp_w 1 0.0524 6.0250 -471.36
<none> 5.9726 -470.72
- seasonal_month 11 0.9898 6.9624 -468.95
- rain_w 1 0.1643 6.1370 -468.51
- fx_w 1 1.1370 7.1096 -445.71
- numeric_week 1 16.1784 22.1510 -269.56
Step: AIC=-472.59
sales_w ~ numeric_week + seasonal_month + fx_w + temp_w + rain_w
Df Sum of Sq RSS AIC
- temp_w 1 0.0597 6.0370 -473.05
<none> 5.9773 -472.59
- seasonal_month 11 0.9855 6.9628 -470.94
- rain_w 1 0.1652 6.1426 -470.37
- fx_w 1 1.1490 7.1264 -447.34
- numeric_week 1 19.7232 25.7005 -248.52
Step: AIC=-473.05
sales_w ~ numeric_week + seasonal_month + fx_w + rain_w
Df Sum of Sq RSS AIC
<none> 6.0370 -473.05
- rain_w 1 0.2277 6.2648 -469.31
- seasonal_month 11 1.3008 7.3379 -464.81
- fx_w 1 1.6010 7.6380 -438.59
- numeric_week 1 20.3284 26.3655 -246.56
# Summary of the final stepwise model
summary(ols2_stepwise_w)
Call:
lm(formula = sales_w ~ numeric_week + seasonal_month + fx_w +
rain_w, data = df_merged_w)
Residuals:
Min 1Q Median 3Q Max
-0.5797 -0.1243 0.0106 0.1247 0.5622
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.941702 2.597439 -0.748 0.4560
numeric_week 0.542540 0.024988 21.712 < 2e-16
seasonal_monthFebruary 0.003119 0.082343 0.038 0.9698
seasonal_monthMarch -0.008609 0.081117 -0.106 0.9156
seasonal_monthApril 0.058496 0.084093 0.696 0.4878
seasonal_monthMay 0.160561 0.085157 1.885 0.0614
seasonal_monthJune 0.171277 0.088309 1.940 0.0544
seasonal_monthJuly 0.140843 0.079982 1.761 0.0804
seasonal_monthAugust -0.015579 0.083209 -0.187 0.8518
seasonal_monthSeptember -0.130180 0.082537 -1.577 0.1170
seasonal_monthOctober -0.116443 0.081307 -1.432 0.1543
seasonal_monthNovember -0.011869 0.090153 -0.132 0.8954
seasonal_monthDecember -0.096493 0.081089 -1.190 0.2361
fx_w 2.106952 0.345784 6.093 1.01e-08
rain_w -0.121385 0.052818 -2.298 0.0230
(Intercept)
numeric_week ***
seasonal_monthFebruary
seasonal_monthMarch
seasonal_monthApril
seasonal_monthMay .
seasonal_monthJune .
seasonal_monthJuly .
seasonal_monthAugust
seasonal_monthSeptember
seasonal_monthOctober
seasonal_monthNovember
seasonal_monthDecember
fx_w ***
rain_w *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2077 on 140 degrees of freedom
Multiple R-squared: 0.8414, Adjusted R-squared: 0.8255
F-statistic: 53.04 on 14 and 140 DF, p-value: < 2.2e-16
# Add predictions from the final stepwise model
# (add_predictions is a helper defined earlier in this script; presumably it
# stores the model's fitted values in the named column — confirm against its
# definition above)
df_merged_w <- add_predictions(ols2_stepwise_w, df_merged_w, "predicted_sales2")
# Plot Actual vs Predicted Values
# NOTE: weekly sales were modelled on the log scale, so exp() maps values
# back to the original COP scale before plotting
ggplot(df_merged_w, aes(x = week)) +
geom_line(aes(y = exp(sales_w), color = "Actual Sales"), size = 1) +
geom_line(aes(y = exp(predicted_sales0), color = "Model 0"), linetype = "dashed", size = 1) +
geom_line(aes(y = exp(predicted_sales1), color = "Model 1"), linetype = "dotted", size = 1) +
geom_line(aes(y = exp(predicted_sales2), color = "Model 2 Stepwise"), linetype = "dotdash", size = 1) +
labs(title = "Actual vs Predicted Weekly Sales for All Models",
x = "Week", y = "Sales", color = "Legend") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Models to compare: list order matches the predicted_sales0..2 columns
models_w <- list(
"Model trend" = ols0w,
"Model trend + season" = ols1w,
"Model all covariates step" = ols2_stepwise_w
)
# Get R^2 and AIC for each model
model_stats_w <- get_model_stats(models_w)
# View the results
print(model_stats_w)
# RMSE on the original (exponentiated) scale for each weekly model.
# Weekly sales were modelled in logs, so values are exponentiated back to
# COP before computing the error. The fitted columns follow the convention
# predicted_sales0, 1, 2, in the same order as `models_w`.
# Built with vapply() instead of growing a data.frame via rbind() in a loop
# (quadratic copying, and type-unsafe).
rmse_stats_w <- data.frame(
  Model = names(models_w),
  RMSE = vapply(seq_along(models_w), function(i) {
    predicted_column <- paste0("predicted_sales", i - 1) # column name index
    calculate_rmse(
      observed = exp(df_merged_w$sales_w),
      predicted = exp(df_merged_w[[predicted_column]])
    )
  }, numeric(1)),
  stringsAsFactors = FALSE
)
# View RMSE statistics
print(rmse_stats_w)
NA
rmse_ols_w <- rmse_stats_w$RMSE[3]
rmse_ols_w
[1] 4000370
# Daily Models
head(df_merged_d,25)
# properly start in december
df_merged_d <- df_merged_d %>%
filter(date > "2021-11-30")
head(df_merged_d)
## Model 0: Trend only
ols0d <- run_model(sales_cop ~ numeric_day, df_merged_d, "Model 0A")
Running Model 0A
Call:
lm(formula = formula, data = data)
Residuals:
Min 1Q Median 3Q Max
-15.2083 -0.1814 0.2027 0.5771 1.6749
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.5199 0.4261 24.688 <2e-16 ***
numeric_day 0.6774 0.0695 9.746 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.762 on 1038 degrees of freedom
Multiple R-squared: 0.08383, Adjusted R-squared: 0.08295
F-statistic: 94.98 on 1 and 1038 DF, p-value: < 2.2e-16
df_merged_d <- add_predictions(ols0d, df_merged_d, "predicted_sales0")
## Model 1: Trend + Seasonality
ols1d <- run_model(sales_cop ~ numeric_day + seasonal_month + day_of_week, df_merged_d, "Model 1A")
Running Model 1A
Call:
lm(formula = formula, data = data)
Residuals:
Min 1Q Median 3Q Max
-15.1930 -0.1446 0.1520 0.4321 2.1992
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.41648 0.46748 22.282 < 2e-16 ***
numeric_day 0.60222 0.07264 8.290 3.54e-16 ***
seasonal_monthFebruary 0.37785 0.26287 1.437 0.15091
seasonal_monthMarch 0.24291 0.25677 0.946 0.34435
seasonal_monthApril 0.21725 0.26027 0.835 0.40407
seasonal_monthMay 0.03779 0.25763 0.147 0.88342
seasonal_monthJune 0.15499 0.26174 0.592 0.55388
seasonal_monthJuly 0.23454 0.25933 0.904 0.36599
seasonal_monthAugust 0.25260 0.26021 0.971 0.33190
seasonal_monthSeptember -0.03710 0.26437 -0.140 0.88843
seasonal_monthOctober 0.23107 0.26387 0.876 0.38140
seasonal_monthNovember 0.08953 0.29411 0.304 0.76087
seasonal_monthDecember -0.71856 0.25824 -2.783 0.00549 **
day_of_weekTuesday -0.06744 0.19830 -0.340 0.73385
day_of_weekWednesday 0.22634 0.19767 1.145 0.25247
day_of_weekThursday 0.47521 0.19734 2.408 0.01621 *
day_of_weekFriday 1.10918 0.19862 5.584 3.01e-08 ***
day_of_weekSaturday 0.93769 0.19827 4.729 2.57e-06 ***
day_of_weekSunday 0.61578 0.19826 3.106 0.00195 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.705 on 1021 degrees of freedom
Multiple R-squared: 0.1557, Adjusted R-squared: 0.1408
F-statistic: 10.46 on 18 and 1021 DF, p-value: < 2.2e-16
df_merged_d <- add_predictions(ols1d, df_merged_d, "predicted_sales1")
# Model 2: Backward
head(df_merged_d)
# Start with the full model (excluding food and bar)
ols2_full_d <- lm(
sales_cop ~ numeric_day + seasonal_month + day_of_week + fx +
tmedian + rain_sum,
data = df_merged_d
)
summary(ols2_full_d)
Call:
lm(formula = sales_cop ~ numeric_day + seasonal_month + day_of_week +
fx + tmedian + rain_sum, data = df_merged_d)
Residuals:
Min 1Q Median 3Q Max
-15.2560 -0.1301 0.1434 0.4180 2.2430
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.681458 7.563025 0.487 0.62653
numeric_day 0.555771 0.078969 7.038 3.58e-12
seasonal_monthFebruary 0.409437 0.263765 1.552 0.12091
seasonal_monthMarch 0.254867 0.257099 0.991 0.32176
seasonal_monthApril 0.215634 0.264563 0.815 0.41523
seasonal_monthMay 0.002179 0.263006 0.008 0.99339
seasonal_monthJune 0.125315 0.268566 0.467 0.64088
seasonal_monthJuly 0.230188 0.260048 0.885 0.37627
seasonal_monthAugust 0.284052 0.260701 1.090 0.27616
seasonal_monthSeptember 0.004766 0.265632 0.018 0.98569
seasonal_monthOctober 0.152011 0.266104 0.571 0.56796
seasonal_monthNovember -0.092994 0.303301 -0.307 0.75921
seasonal_monthDecember -0.744359 0.258962 -2.874 0.00413
day_of_weekTuesday -0.063129 0.198020 -0.319 0.74994
day_of_weekWednesday 0.237591 0.197534 1.203 0.22934
day_of_weekThursday 0.471793 0.197065 2.394 0.01684
day_of_weekFriday 1.102140 0.198329 5.557 3.50e-08
day_of_weekSaturday 0.930391 0.198011 4.699 2.98e-06
day_of_weekSunday 0.615126 0.198134 3.105 0.00196
fx 0.787878 0.880843 0.894 0.37129
tmedian -0.143825 0.999347 -0.144 0.88559
rain_sum 0.137507 0.101046 1.361 0.17387
(Intercept)
numeric_day ***
seasonal_monthFebruary
seasonal_monthMarch
seasonal_monthApril
seasonal_monthMay
seasonal_monthJune
seasonal_monthJuly
seasonal_monthAugust
seasonal_monthSeptember
seasonal_monthOctober
seasonal_monthNovember
seasonal_monthDecember **
day_of_weekTuesday
day_of_weekWednesday
day_of_weekThursday *
day_of_weekFriday ***
day_of_weekSaturday ***
day_of_weekSunday **
fx
tmedian
rain_sum
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.703 on 1018 degrees of freedom
Multiple R-squared: 0.1609, Adjusted R-squared: 0.1436
F-statistic: 9.294 on 21 and 1018 DF, p-value: < 2.2e-16
# Perform backward stepwise regression
ols2_stepwise_d <- step(
ols2_full_d,
direction = "backward",
trace = 1 # Prints the stepwise regression process
)
Start: AIC=1128.63
sales_cop ~ numeric_day + seasonal_month + day_of_week + fx +
tmedian + rain_sum
Df Sum of Sq RSS AIC
- tmedian 1 0.060 2951.0 1126.7
- fx 1 2.319 2953.3 1127.5
- rain_sum 1 5.368 2956.3 1128.5
<none> 2951.0 1128.6
- seasonal_month 11 79.118 3030.1 1134.2
- numeric_day 1 143.580 3094.6 1176.0
- day_of_week 6 175.600 3126.6 1176.8
Step: AIC=1126.65
sales_cop ~ numeric_day + seasonal_month + day_of_week + fx +
rain_sum
Df Sum of Sq RSS AIC
- fx 1 2.277 2953.3 1125.5
<none> 2951.0 1126.7
- rain_sum 1 6.045 2957.1 1126.8
- seasonal_month 11 79.429 3030.5 1132.3
- day_of_week 6 175.581 3126.6 1174.8
- numeric_day 1 146.691 3097.7 1175.1
Step: AIC=1125.45
sales_cop ~ numeric_day + seasonal_month + day_of_week + rain_sum
Df Sum of Sq RSS AIC
<none> 2953.3 1125.5
- rain_sum 1 15.758 2969.1 1129.0
- seasonal_month 11 79.049 3032.4 1130.9
- numeric_day 1 144.596 3097.9 1173.2
- day_of_week 6 175.164 3128.5 1173.4
# Summary of the final stepwise model
summary(ols2_stepwise_d)
Call:
lm(formula = sales_cop ~ numeric_day + seasonal_month + day_of_week +
rain_sum, data = df_merged_d)
Residuals:
Min 1Q Median 3Q Max
-15.1816 -0.1390 0.1437 0.4305 2.3591
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.54422 0.59782 15.965 < 2e-16 ***
numeric_day 0.54288 0.07682 7.067 2.93e-12 ***
seasonal_monthFebruary 0.42246 0.26300 1.606 0.10851
seasonal_monthMarch 0.24004 0.25622 0.937 0.34905
seasonal_monthApril 0.17428 0.26036 0.669 0.50339
seasonal_monthMay -0.03250 0.25883 -0.126 0.90009
seasonal_monthJune 0.07827 0.26324 0.297 0.76626
seasonal_monthJuly 0.21366 0.25892 0.825 0.40946
seasonal_monthAugust 0.27803 0.25988 1.070 0.28494
seasonal_monthSeptember 0.01683 0.26481 0.064 0.94933
seasonal_monthOctober 0.17391 0.26444 0.658 0.51091
seasonal_monthNovember -0.08671 0.30304 -0.286 0.77483
seasonal_monthDecember -0.75067 0.25804 -2.909 0.00370 **
day_of_weekTuesday -0.06287 0.19788 -0.318 0.75076
day_of_weekWednesday 0.24147 0.19735 1.224 0.22139
day_of_weekThursday 0.47065 0.19692 2.390 0.01703 *
day_of_weekFriday 1.10202 0.19821 5.560 3.45e-08 ***
day_of_weekSaturday 0.92892 0.19788 4.694 3.04e-06 ***
day_of_weekSunday 0.61671 0.19783 3.117 0.00188 **
rain_sum 0.18939 0.08118 2.333 0.01985 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.702 on 1020 degrees of freedom
Multiple R-squared: 0.1602, Adjusted R-squared: 0.1446
F-statistic: 10.24 on 19 and 1020 DF, p-value: < 2.2e-16
# Add predictions from the final stepwise model
df_merged_d <- add_predictions(ols2_stepwise_d, df_merged_d, "predicted_sales2")
# Plot Actual vs Predicted Values
ggplot(df_merged_d, aes(x = date)) +
geom_line(aes(y = exp(sales_cop), color = "Actual Sales"), size = 1) +
geom_line(aes(y = exp(predicted_sales0), color = "Model 0"), linetype = "dashed", size = 1) +
geom_line(aes(y = exp(predicted_sales1), color = "Model 1"), linetype = "dotted", size = 1) +
geom_line(aes(y = exp(predicted_sales2), color = "Model 2 Stepwise"), linetype = "dotdash", size = 1) +
labs(title = "Actual vs Predicted Sales for All Models",
x = "date", y = "Sales", color = "Legend") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Models to compare
models_d <- list(
"Model trend" = ols0d,
"Model trend + season" = ols1d,
"Model all covariates step" = ols2_stepwise_d
)
# Get R^2 and AIC for each model
model_stats_d <- get_model_stats(models_d)
# View the results
print(model_stats_d)
# RMSE calculation for the original (exponentiated) scale for daily models.
# Daily sales were modelled in logs, so values are exponentiated back to COP
# before computing the error. The fitted columns follow the convention
# predicted_sales0, 1, 2, in the same order as `models_d`.
# Built with vapply() instead of growing a data.frame via rbind() in a loop
# (quadratic copying, and type-unsafe).
rmse_stats_d <- data.frame(
  Model = names(models_d),
  RMSE = vapply(seq_along(models_d), function(i) {
    predicted_column <- paste0("predicted_sales", i - 1) # column name index
    calculate_rmse(
      observed = exp(df_merged_d$sales_cop),
      predicted = exp(df_merged_d[[predicted_column]])
    )
  }, numeric(1)),
  stringsAsFactors = FALSE
)
# View RMSE statistics for daily data
print(rmse_stats_d)
rmse_ols_d <- rmse_stats_d$RMSE[3]
rmse_ols_d
[1] 1421301
Here we explore non-linear models, starting from the simplest and moving to more elaborate ones, combining some of the previously used models at the end.
The time series are re-indexed by date (we change the date index of the time-series objects) so that the visualizations are easier to read.
# Re-declare the time series because some rows were dropped earlier
# (pre-opening dates and the first weeks with zero sales).
# Ensure the 'date' columns are in Date format
df_merged_d$date <- as.Date(df_merged_d$date)
df_merged_w$date <- as.Date(df_merged_w$week)
df_merged_m$date <- as.Date(df_merged_m$month)
# Extract the start date for each dataframe
start_d <- min(df_merged_d$date)
start_w <- min(df_merged_w$date)
start_m <- min(df_merged_m$date)
# Extract components for daily, weekly, and monthly start times
start_d_year <- as.numeric(format(start_d, "%Y"))
start_d_day <- as.numeric(format(start_d, "%j")) # Day of the year (001-366)
start_w_year <- as.numeric(format(start_w, "%Y"))
start_w_week <- as.numeric(format(start_w, "%U")) + 1 # %U is zero-based week of year, so add 1
start_m_year <- as.numeric(format(start_m, "%Y"))
start_m_month <- as.numeric(format(start_m, "%m"))
# Declare time series with appropriate frequencies.
# exp() back-transforms the log-scale sales columns to the original COP
# scale; the diffusion models below (BM/GBM/GGM) are fitted on this scale.
sales_d_ts <- ts(exp(df_merged_d$sales_cop), start = c(start_d_year, start_d_day), frequency = 365)
sales_w_ts <- ts(exp(df_merged_w$sales_w), start = c(start_w_year, start_w_week), frequency = 52)
sales_m_ts <- ts(exp(df_merged_m$sales_m), start = c(start_m_year, start_m_month), frequency = 12)
food_d_ts <- ts(exp(df_merged_d$food), start = c(start_d_year, start_d_day), frequency = 365)
food_w_ts <- ts(exp(df_merged_w$food_w), start = c(start_w_year, start_w_week), frequency = 52)
food_m_ts <- ts(exp(df_merged_m$food_m), start = c(start_m_year, start_m_month), frequency = 12)
bar_d_ts <- ts(exp(df_merged_d$bar), start = c(start_d_year, start_d_day), frequency = 365)
bar_w_ts <- ts(exp(df_merged_w$bar_w), start = c(start_w_year, start_w_week), frequency = 52)
bar_m_ts <- ts(exp(df_merged_m$bar_m), start = c(start_m_year, start_m_month), frequency = 12)
# Verify the created time series visually
par(mfrow=c(1,1))
plot(sales_d_ts)
plot(sales_w_ts)
plot(sales_m_ts)
plot(food_d_ts)
plot(food_w_ts)
plot(food_m_ts)
plot(bar_d_ts)
plot(bar_w_ts)
plot(bar_m_ts)
Here we fill dates with zero sales using the mean of the adjacent dates, in order to obtain smoother series for the models. (On the exponentiated scale a zero in log-sales appears as exp(0) = 1, which is why the function below targets values equal to 1.) The dates with zero sales are national holidays such as Christmas or New Year's, or inventory days on which the kitchen cannot operate.
# Replace placeholder values of 1 with the mean of the nearest real
# observations on either side.
#
# On the exponentiated scale, a zero in log-sales appears as exp(0) = 1;
# those are closed days (holidays, inventory days). Filling them gives the
# diffusion models a smoother series to fit.
#
# Fix over the original version: consecutive placeholders (e.g. Dec 24-25
# both closed) were previously filled using the adjacent placeholder 1
# itself; here the nearest NON-placeholder neighbours are used instead.
# Also replaces the scalar ifelse() calls with plain if/else and makes the
# == 1 comparison NA-safe.
#
# Args:
#   ts_data: a ts object on the original (exponentiated) sales scale.
# Returns:
#   A ts object with the same start and frequency, with every value equal
#   to 1 replaced by the mean of the closest real observations before and
#   after it (one-sided at the series boundaries).
fill_ones <- function(ts_data) {
  ts_vec <- as.numeric(ts_data)
  # Placeholder positions; guard against NA before comparing
  is_placeholder <- !is.na(ts_vec) & ts_vec == 1
  real_idx <- which(!is_placeholder)
  for (i in which(is_placeholder)) {
    # Nearest real observation on each side (NA if none exists)
    before <- real_idx[real_idx < i]
    after <- real_idx[real_idx > i]
    prev_val <- if (length(before) > 0) ts_vec[max(before)] else NA_real_
    next_val <- if (length(after) > 0) ts_vec[min(after)] else NA_real_
    ts_vec[i] <- mean(c(prev_val, next_val), na.rm = TRUE)
  }
  # Return as time series with the original attributes
  ts(ts_vec, start = start(ts_data), frequency = frequency(ts_data))
}
# Apply the placeholder-filling function to every series so closed days
# (holidays/inventory days, recorded as 1 = exp(0)) do not distort the fits
sales_d_ts <- fill_ones(sales_d_ts)
sales_w_ts <- fill_ones(sales_w_ts)
sales_m_ts <- fill_ones(sales_m_ts)
food_d_ts <- fill_ones(food_d_ts)
food_w_ts <- fill_ones(food_w_ts)
food_m_ts <- fill_ones(food_m_ts)
bar_d_ts <- fill_ones(bar_d_ts)
bar_w_ts <- fill_ones(bar_w_ts)
bar_m_ts <- fill_ones(bar_m_ts)
# Some simple plots
plot(sales_m_ts)
plot(cumsum(sales_m_ts)) # cumulative sales: the scale the diffusion models fit
# Bass model
bm_m<-BM(sales_m_ts,display = T) # show graphical view of results / display = True
summary(bm_m)
Call: ( Standard Bass Model )
BM(series = sales_m_ts, display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-94382506 -37311459 -12636337 -10394220 24066002 75506534
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 47282474 on 33 degrees of freedom
Multiple R-squared: 0.999374 Residual sum of squares: 7.377587e+16
bm_m$coefficients['m'] - sum(sales_m_ts)
m
1148560231
According to this estimate, only about 1 billion COP of potential sales remain — less than a year of operation — which seems wrong. The model fits well, but the behaviour from observation 30 onward is odd, and sales may not actually be declining yet. Still, the fit reflects innovation and imitation effects in some sense.
The restaurant relies on word of mouth to reach its full market potential of m = 4,664,000,000 COP (roughly 1 million EUR); it has sold 3,515,788,885 COP so far, so according to the model sales should be exhausted within about a year. The innovation coefficient p = 0.832% indicates that adoption driven by external influence is relatively low, which is not uncommon for many markets, while the imitation coefficient q = 8.96% suggests that imitation plays a larger role than innovation in driving adoption in this market.
pred_bm_m<- predict(bm_m, newx=c(1:length(sales_m_ts)))
pred_bm_m <- ts(pred_bm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
pred.inst_bm_m <- make.instantaneous(pred_bm_m)
pred.inst_bm_m <- ts(pred.inst_bm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
# plot
plot(sales_m_ts, type = "p", col = "black", pch = 16, cex = 0.7,
xlab = "Month", ylab = "Monthly Sales", main = "Actual vs Fitted Sales")
# Add the fitted values as a line
lines(pred.inst_bm_m, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values"),
col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# check residuals
res_bm_m <- sales_m_ts - pred.inst_bm_m
tsdisplay(res_bm_m)
Residuals show some remaining structure, and lag 2 exhibits significant autocorrelation.
# Calculate RMSE for Bass Model predictions
rmse_bm_m <- calculate_rmse(observed = sales_m_ts, predicted = pred.inst_bm_m)
# Print the RMSE
cat("RMSE for Bass Model Predictions:", rmse_bm_m, "\n")
RMSE for Bass Model Predictions: 18498870
bm_w<-BM(sales_w_ts,display = T) # show graphical view of results / display = True
summary(bm_w)
Call: ( Standard Bass Model )
BM(series = sales_w_ts, display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-90378130 -33009033 -13970791 -8913677 19483237 72394978
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 40676860 on 152 degrees of freedom
Multiple R-squared: 0.999528 Residual sum of squares: 2.515003e+17
bm_w$coefficients['m'] - sum(sales_w_ts)
m
1178496087
# results are similar in terms of m, p and w are in other scale
#because they are in different time stamp
bm_m$coefficients['q'] / bm_w$coefficients['q'] # they are approx 4 times
q
4.506878
bm_m$coefficients['p'] / bm_w$coefficients['p'] # they are approx 4 times
p
4.010805
# which makes sense
The monthly coefficients are approximately 4 times the weekly ones, which makes sense because there are about 4 weeks in a month; the estimated market potential is similar across the two models.
# Prediction
pred_bm_w<- predict(bm_w, newx=c(1:length(sales_w_ts)))
pred_bm_w <- ts(pred_bm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
pred.inst_bm_w <- make.instantaneous(pred_bm_w)
pred.inst_bm_w <- ts(pred.inst_bm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
# plot
plot(sales_w_ts, type = "p", col = "black", pch = 16, cex = 0.7,
xlab = "Week", ylab = "Weekly Sales", main = "Actual vs Fitted Sales")
# Add the fitted values as a line
lines(pred.inst_bm_w, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values"),
col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# check residuals
res_bm_w <- sales_w_ts - pred.inst_bm_w
tsdisplay(res_bm_w)
Residuals show a clear trend and remaining structure, and lag 2 exhibits significant autocorrelation.
# RMSE
# Calculate RMSE for Bass Model predictions
rmse_bm_w <- calculate_rmse(observed = sales_w_ts, predicted = pred.inst_bm_w)
# Print the RMSE
cat("RMSE for Bass Model Predictions:", rmse_bm_w, "\n")
RMSE for Bass Model Predictions: 4542019
bm_d <- BM(
sales_d_ts,
prelimestimates = c(1.2 * sum(sales_d_ts), 0.005, 0.5), # Adjust these estimates
display = TRUE
)
summary(bm_d)
Call: ( Standard Bass Model )
BM(series = sales_d_ts, prelimestimates = c(1.2 * sum(sales_d_ts),
0.005, 0.5), display = TRUE)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-81086932 -29957254 -10704557 -7197438 17139516 79381813
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 34753453 on 1037 degrees of freedom
Multiple R-squared: 0.999642 Residual sum of squares: 1.252491e+18
bm_d$coefficients['m'] - sum(sales_d_ts)
m
1178708770
# results are similar in terms of m, p and w are in other scale
#because they are in different time stamp
bm_w$coefficients['q'] / bm_d$coefficients['q'] # they are approx 7 times
q
6.987921
bm_w$coefficients['p'] / bm_d$coefficients['p'] # they are approx 7 times
p
6.375449
Coefficients are approximately 1:7 scale of the ones in the weekly model, making sense. The market potential is also similar in order of magnitude.
# Prediction
pred_bm_d <- predict(bm_d, newx = c(1:length(sales_d_ts)))
pred_bm_d <- ts(pred_bm_d, start = start(sales_d_ts), frequency = frequency(sales_d_ts))
pred.inst_bm_d <- make.instantaneous(pred_bm_d)
pred.inst_bm_d <- ts(pred.inst_bm_d, start = start(sales_d_ts), frequency = frequency(sales_d_ts))
# Plot actual vs fitted sales for daily data
plot(sales_d_ts, type = "p", col = "black", pch = 16, cex = 0.7,
xlab = "Day", ylab = "Daily Sales", main = "Actual vs Fitted Sales (Daily)")
# Add the fitted values as a line
lines(pred.inst_bm_d, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values"),
col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# Check residuals
res_bm_d <- sales_d_ts - pred.inst_bm_d
tsdisplay(res_bm_d)
Residuals do not seem stationary; at the very least they show strong autocorrelation.
# Calculate RMSE for Bass Model predictions (daily data)
rmse_bm_d <- calculate_rmse(observed = sales_d_ts, predicted = pred.inst_bm_d)
# Print the RMSE
cat("RMSE for Daily Bass Model Predictions:", rmse_bm_d, "\n")
RMSE for Daily Bass Model Predictions: 1651896
The Bass model assumes that every product succeeds and that sales saturate at a steady-state level; in reality, most new products fail.
The market potential m is assumed constant over the whole life cycle.
Bass-model predictions work well only after the sales inflection point; if sales of a category keep rising like a J-curve, the model can overestimate the overall market size.
It is a model for products with a limited life cycle — an assumption that itself needs to be justified.
Another drawback of the Bass model is that the diffusion pattern is not affected by marketing-mix variables such as price or advertising.
The Generalized Bass Model extends the original Bass model by allowing marketing-mix variables to play a role.
The Bass model is used to forecast the adoption of a new product and to predict sales, since it determines the shape of the cumulative-adoption curve. The Generalized Bass Model extends it by incorporating marketing-mix variables, so we can measure the effect of pricing and promotions on the diffusion curve; it is more flexible than the original Bass model.
m <- 4.451570e+09
p <- 8.472917e-03
q <- 9.415625e-02
GBM_monthly_sales <- GBM(
sales_m_ts,
shock = 'exp',
nshock = 1,
#prelimestimates = c(m,p,q, 12, 0.1, -0.1)
prelimestimates = c(m,p,q, 10, 0.1, 2)
#prelimestimates = c(m,p,q, 11, 15, -0.1)
)
summary(GBM_monthly_sales)
Call: ( Generalized Bass model with 1 Exponential shock )
GBM(series = sales_m_ts, shock = "exp", nshock = 1, prelimestimates = c(m,
p, q, 10, 0.1, 2))
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-27853732 -10753200 -127747 -619963 6529609 25477494
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 13902026 on 30 degrees of freedom
Multiple R-squared: 0.999866 Residual sum of squares: 5.79799e+15
pred_GBM_monthly_sales<- predict(GBM_monthly_sales, newx=c(1:60))
pred_GBM_monthly_sales.inst<- make.instantaneous(pred_GBM_monthly_sales)
# Montly model
ggm1 <- GGM(sales_m_ts, mt='base', display = T)
Warning: NaNs producedWarning: NaNs producedWarning: NaNs producedWarning: NaNs producedWarning: NaNs producedWarning: NaNs produced
ggm2 <- GGM(sales_m_ts, mt= function(x) pchisq(x,10),display = T)
summary(ggm1)
Call: ( Guseo Guidolin Model )
GGM(series = sales_m_ts, mt = "base", display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-27544719 -8809035 742251 -337148 7701034 23238738
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 13318300 on 31 degrees of freedom
Multiple R-squared: 0.999873 Residual sum of squares: 5.498691e+15
summary(ggm2)
Call: ( Guseo Guidolin Model )
GGM(series = sales_m_ts, mt = function(x) pchisq(x, 10), display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-55479254 -21799576 5818484 25726804 63528263 144216338
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 64933559 on 33 degrees of freedom
Multiple R-squared: 0.99678 Residual sum of squares: 1.391401e+17
# try different functions for market potential
ggm3 <- GGM(sales_m_ts, mt= function(x) log(x),display = T)
ggm4 <- GGM(sales_m_ts, mt= function(x) (x)**(1/1.05),display = T)
summary(ggm3)
Call: ( Guseo Guidolin Model )
GGM(series = sales_m_ts, mt = function(x) log(x), display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-51543377 -17143219 -1444208 -3434599 7197387 38682408
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 23487767 on 33 degrees of freedom
Multiple R-squared: 0.999579 Residual sum of squares: 1.820528e+16
summary(ggm4)
Call: ( Guseo Guidolin Model )
GGM(series = sales_m_ts, mt = function(x) (x)^(1/1.05), display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-33877044 -9900757 -986705 -1517515 10675797 29488516
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 16905824 on 33 degrees of freedom
Multiple R-squared: 0.999782 Residual sum of squares: 9.431627e+15
K <- 7.683785e+09
pc <- 2.698613e-02
qc <- 2.582412e-01
ps <- 7.731763e-03
qs <- 4.508202e-02
# Predictions: evaluate the fitted monthly GGM over the observed horizon.
# seq_along() replaces c(1:length(x)), which is redundant and misbehaves
# when the series is empty (1:0 yields c(1, 0)).
pred_ggm_m <- predict(ggm1, newx = seq_along(sales_m_ts))
pred_ggm_m <- ts(pred_ggm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
# The GGM prediction is cumulative; convert to per-period (instantaneous) values
pred.inst_ggm_m <- make.instantaneous(pred_ggm_m)
pred.inst_ggm_m <- ts(pred.inst_ggm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
# Plot actual vs fitted sales for monthly data
plot(sales_m_ts, type = "p", col = "black", pch = 16, cex = 0.7,
     xlab = "Month", ylab = "Monthly Sales", main = "Actual vs Fitted Sales (GGM Model)")
# Add the fitted values as a line
lines(pred.inst_ggm_m, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values (GGM)"),
       col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# Analysis of residuals: observed minus instantaneous fitted values
res_GGM_m <- sales_m_ts - pred.inst_ggm_m
tsdisplay(res_GGM_m)
Residuals look stationary for this model
# Residuals somehow are kind of stationary
# check for stationarity of residuals
adf_test <- adf.test(res_GGM_m)
print(adf_test) # if p-val < alpha, series stationary
Augmented Dickey-Fuller Test
data: res_GGM_m
Dickey-Fuller = -4.1, Lag order = 3, p-value = 0.01708
alternative hypothesis: stationary
# so with this model we achieve stationary series
# check for autocorrelation in residuals
Box.test(res_GGM_m, lag = 10, type = "Ljung-Box") # h0 res indep
Box-Ljung test
data: res_GGM_m
X-squared = 16.263, df = 10, p-value = 0.09234
# p-val > alpha => fail to reject h0, so residuals seem indep
Residuals are likely stationary
# Calculate RMSE for ggm1
rmse_ggm1 <- calculate_rmse(observed = sales_m_ts, predicted = pred.inst_ggm_m)
# Print RMSE for ggm1
cat("RMSE for GGM Model 1 (Base):", rmse_ggm1, "\n")
RMSE for GGM Model 1 (Base): 11759505
# Weekly
ggm1_w <- GGM(sales_w_ts, mt='base', display = T)
ggm2_w <- GGM(sales_w_ts, mt= function(x) pchisq(x,25),display = T)
summary(ggm1_w) # this one is better
Call: ( Guseo Guidolin Model )
GGM(series = sales_w_ts, mt = "base", display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-30941134 -8439496 1818273 88704 8469414 21113528
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 12271825 on 150 degrees of freedom
Multiple R-squared: 0.999876 Residual sum of squares: 2.258965e+16
summary(ggm2_w)
Call: ( Guseo Guidolin Model )
GGM(series = sales_w_ts, mt = function(x) pchisq(x, 25), display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-34347002 -18821378 -462825 7370123 29484361 96549738
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 33658058 on 152 degrees of freedom
Multiple R-squared: 0.999055 Residual sum of squares: 1.721955e+17
# try different functions for market potential
ggm3_w <- GGM(sales_w_ts, mt= function(x) log(x),display = T)
ggm4_w <- GGM(sales_w_ts, mt= function(x) (x)**(1/1.05),display = T)
summary(ggm3_w)
Call: ( Guseo Guidolin Model )
GGM(series = sales_w_ts, mt = function(x) log(x), display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-62209555 -22307652 -3737433 -4861230 11043715 47916399
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 26307052 on 152 degrees of freedom
Multiple R-squared: 0.999423 Residual sum of squares: 1.051933e+17
summary(ggm4_w) # better shaped but less significant
Call: ( Guseo Guidolin Model )
GGM(series = sales_w_ts, mt = function(x) (x)^(1/1.05), display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-34605136 -10571355 2199183 -413132 8482721 26917353
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 14257525 on 152 degrees of freedom
Multiple R-squared: 0.99983 Residual sum of squares: 3.089811e+16
# Predictions: evaluate the fitted weekly GGM over the observed horizon.
# seq_along() replaces c(1:length(x)), which misbehaves on empty input.
pred_ggm_w <- predict(ggm1_w, newx = seq_along(sales_w_ts))
pred_ggm_w <- ts(pred_ggm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
# The GGM prediction is cumulative; convert to per-week (instantaneous) values
pred.inst_ggm_w <- make.instantaneous(pred_ggm_w)
pred.inst_ggm_w <- ts(pred.inst_ggm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
# Plot actual vs fitted sales for weekly data
plot(sales_w_ts, type = "p", col = "black", pch = 16, cex = 0.7,
     xlab = "Week", ylab = "Weekly Sales", main = "Actual vs Fitted Sales (GGM Model)")
# Add the fitted values as a line
lines(pred.inst_ggm_w, col = "red", lwd = 2)
# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values (GGM)"),
       col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# Analysis of residuals: observed minus instantaneous fitted values
res_GGM_w <- sales_w_ts - pred.inst_ggm_w
tsdisplay(res_GGM_w)
# Check for stationarity of residuals
adf_test_w <- adf.test(res_GGM_w)
Warning: p-value smaller than printed p-value
print(adf_test_w) # if p-value < alpha, series is stationary
Augmented Dickey-Fuller Test
data: res_GGM_w
Dickey-Fuller = -4.3175, Lag order = 5, p-value = 0.01
alternative hypothesis: stationary
# Check for autocorrelation in residuals
box_test_w <- Box.test(res_GGM_w, lag = 10, type = "Ljung-Box")
print(box_test_w) # if p-value > alpha, residuals are independent
Box-Ljung test
data: res_GGM_w
X-squared = 55.579, df = 10, p-value = 2.462e-08
Series is stationary according to tests, but clearly has strong autocorrelation
# RMSE
rmse_ggm_w <- calculate_rmse(observed = sales_w_ts, predicted = pred.inst_ggm_w)
# Print the RMSE
cat("RMSE for Weekly GGM Model Predictions:", rmse_ggm_w, "\n")
RMSE for Weekly GGM Model Predictions: 3453199
# Daily GGM
# Scaling the sales data
# Min-max scale the daily series to [0, 1]; the raw values are in the
# tens of millions and the GGM optimiser fails to converge on them.
# sales_min / sales_max are kept so predictions can be inverted back
# to the original scale later.
sales_min <- min(sales_d_ts)
sales_max <- max(sales_d_ts)
sales_scaled <- (sales_d_ts - sales_min) / (sales_max - sales_min)
# View scaled data
summary(sales_scaled)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.1904 0.2867 0.3217 0.4343 1.0000
plot(sales_scaled, type = "l", main = "Scaled Daily Sales", xlab = "Day", ylab = "Scaled Sales")
We re-scale the data because otherwise the model won't converge
# Fit GGM models using scaled data
ggm1_d <- GGM(sales_scaled, mt = 'base', display = T)
Warning: NaNs producedWarning: NaNs producedWarning: NaNs produced
ggm2_d <- GGM(sales_scaled, mt = function(x) pchisq(x, 10), display = T)
ggm3_d <- GGM(sales_scaled, mt = function(x) log(x), display = T)
ggm4_d <- GGM(sales_scaled, mt = function(x) (x)^(1/1.05), display = T)
# Summarize models
summary(ggm1_d) # Base model
Call: ( Guseo Guidolin Model )
GGM(series = sales_scaled, mt = "base", display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.40144 -0.89220 0.18363 0.04369 0.98854 2.48411
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 1.202014 on 1035 degrees of freedom
Multiple R-squared: 0.999862 Residual sum of squares: 1495.407
summary(ggm2_d) # Chi-squared
Call: ( Guseo Guidolin Model )
GGM(series = sales_scaled, mt = function(x) pchisq(x, 10), display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-7.8650 -2.9043 -0.9269 -0.6900 1.6614 7.6964
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 3.369694 on 1037 degrees of freedom
Multiple R-squared: 0.998917 Residual sum of squares: 11774.97
summary(ggm3_d) # Log transformation
Call: ( Guseo Guidolin Model )
GGM(series = sales_scaled, mt = function(x) log(x), display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-6.0614 -2.1300 -0.4001 -0.4470 1.0521 5.8832
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 2.489536 on 1037 degrees of freedom
Multiple R-squared: 0.999409 Residual sum of squares: 6427.109
summary(ggm4_d) # Power transformation
Call: ( Guseo Guidolin Model )
GGM(series = sales_scaled, mt = function(x) (x)^(1/1.05), display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.33513 -0.98706 0.18913 0.07835 0.93237 3.00796
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 1.307058 on 1037 degrees of freedom
Multiple R-squared: 0.999837 Residual sum of squares: 1771.61
We predict using the best model based on fit and p-values: we select model 1
# Prediction using the daily GGM fitted on the scaled series.
# seq_along() replaces c(1:length(x)), which misbehaves on empty input.
pred_ggm_d <- predict(ggm1_d, newx = seq_along(sales_scaled))
pred_ggm_d <- ts(pred_ggm_d, start = start(sales_scaled), frequency = frequency(sales_scaled))
# The GGM prediction is cumulative; convert to per-day (instantaneous) values
pred.inst_ggm_d <- make.instantaneous(pred_ggm_d)
pred.inst_ggm_d <- ts(pred.inst_ggm_d, start = start(sales_scaled), frequency = frequency(sales_scaled))
# Re-scale predictions back to the original scale (inverse of the min-max transform)
pred_original_scale <- (pred.inst_ggm_d * (sales_max - sales_min)) + sales_min
# Plot actual vs fitted sales (original scale)
plot(sales_d_ts, type = "p", col = "black", pch = 16, cex = 0.7,
     xlab = "Day", ylab = "Daily Sales", main = "Actual vs Fitted Sales (Original Scale)")
lines(pred_original_scale, col = "red", lwd = 2)
legend("topleft", legend = c("Actual Values", "Fitted Values (GGM, Original Scale)"),
       col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))
# Analysis of residuals: observed minus back-transformed fitted values
res_GGM_d <- sales_d_ts - pred_original_scale
tsdisplay(res_GGM_d, main = "Residuals of GGM Model")
Residuals don't look stationary
# Check for stationarity of residuals
adf_test_d <- adf.test(res_GGM_d)
Warning: p-value smaller than printed p-value
print(adf_test_d) # If p-value < alpha, series is stationary
Augmented Dickey-Fuller Test
data: res_GGM_d
Dickey-Fuller = -7.8042, Lag order = 10, p-value = 0.01
alternative hypothesis: stationary
# according to this, they are stationary
# Check for autocorrelation in residuals
box_test_d <- Box.test(res_GGM_d, lag = 10, type = "Ljung-Box")
print(box_test_d) # If p-value > alpha, residuals are indep
Box-Ljung test
data: res_GGM_d
X-squared = 1403.5, df = 10, p-value < 2.2e-16
Residuals look stationary according to the test, but they have serial correlation
# Calculate RMSE for GGM model predictions (original scale)
rmse_ggm_d <- calculate_rmse(observed = sales_d_ts, predicted = pred_original_scale)
# Print the RMSE
cat("RMSE for Daily GGM Model Predictions (Original Scale):", rmse_ggm_d, "\n")
RMSE for Daily GGM Model Predictions (Original Scale): 1600510
# adjust timeseries to ensure date consistency
sales_m_ts <- ts(sales_m_ts, frequency=12, start=c(2021, 11))
hw1_m<- hw(sales_m_ts, seasonal="additive")
hw2_m<- hw(sales_m_ts, seasonal="multiplicative")
# prediction
fitted_hw1 <- hw1_m$fitted
fitted_hw2 <- hw2_m$fitted
We now plot the models
# Create a data frame for ggplot: actual series plus both Holt-Winters fits,
# aligned on the same time index
plot_data <- data.frame(
Time = time(sales_m_ts),
Actual = as.numeric(sales_m_ts),
Fitted_Additive = as.numeric(hw1_m$fitted),
Fitted_Multiplicative = as.numeric(hw2_m$fitted)
)
# Melt data for easier ggplot usage (wide -> long; `Series` becomes a factor
# whose levels follow the column order above, which the manual scale relies on)
# NOTE(review): reshape2::melt is superseded; tidyr::pivot_longer is the
# modern equivalent, but it returns a character column and could reorder
# the legend — left as-is to preserve the plot exactly.
library(reshape2)
plot_data_melted <- melt(plot_data, id.vars = "Time",
variable.name = "Series",
value.name = "Value")
# Plot using ggplot2
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) + # Actual values as dots
geom_line(data = subset(plot_data_melted, Series != "Actual"), size = 1) + # Fitted values as lines
labs(
title = "Actual vs Fitted Values",
x = "Time",
y = "Value",
color = "Series"
) +
scale_color_manual(
values = c("Actual" = "black", "Fitted_Additive" = "blue", "Fitted_Multiplicative" = "red"),
labels = c("Actual", "Fitted (Additive)", "Fitted (Multiplicative)")
) +
theme_minimal() +
theme(
legend.position = "top",
legend.title = element_text(face = "bold")
)
Looks like the multiplicative model follows the data more closely in general.
# residuals
residuals_hw1 <- residuals(hw1_m)
residuals_hw2 <- residuals(hw2_m)
tsdisplay(residuals_hw1)
tsdisplay(residuals_hw2)
# Stationarity and Correlation
# check for stationarity of residuals
# additive
adf_test <- adf.test(residuals_hw1) # H0: series is non-stationary
print(adf_test) # if p-val < alpha we reject H0, i.e. the series IS stationary
Augmented Dickey-Fuller Test
data: residuals_hw1
Dickey-Fuller = -1.8495, Lag order = 3, p-value = 0.6317
alternative hypothesis: stationary
# p-val (0.63) > alpha => fail to reject H0: the residuals are NOT stationary
# multiplicative
adf_test <- adf.test(residuals_hw2) # H0: series is non-stationary
print(adf_test) # if p-val < alpha we reject H0, i.e. the series IS stationary
Augmented Dickey-Fuller Test
data: residuals_hw2
Dickey-Fuller = -2.0941, Lag order = 3, p-value = 0.5365
alternative hypothesis: stationary
# p-val (0.54) > alpha => fail to reject H0: the residuals are NOT stationary
# additive
# check for autocorrelation in residuals
Box.test(residuals_hw1, lag = 10, type = "Ljung-Box") # h0 res indep
Box-Ljung test
data: residuals_hw1
X-squared = 4.8498, df = 10, p-value = 0.901
# p-val > alpha => Dont reject h0, so residuals are indep
# additive
# check for autocorrelation in residuals
Box.test(residuals_hw2, lag = 10, type = "Ljung-Box") # h0 res indep
Box-Ljung test
data: residuals_hw2
X-squared = 10.493, df = 10, p-value = 0.3983
# p-val > alpha => Dont reject h0, so residuals are indep
Multiplicative model follows the data better and has slightly better residuals
# Forecast 12 months ahead with both Holt-Winters fits
forecast_hw1 <- forecast(hw1_m, h = 12)
forecast_hw2 <- forecast(hw2_m, h = 12)
# Forecast plot: plot the time series with both point forecasts.
# The `$mean` components are plain ts objects, so autolayer() has no `PI`
# argument for them — passing PI=F only produced "Ignoring unknown
# parameters" warnings, so it has been removed (plot is unchanged).
autoplot(sales_m_ts) +
  autolayer(forecast_hw1$mean, series = "Additive Holt-Winters Forecast") +
  autolayer(forecast_hw2$mean, series = "Multiplicative Holt-Winters Forecast") +
  ggtitle("Sales Forecast with Holt-Winters Models") +
  xlab("Time") +
  ylab("Sales") +
  scale_color_manual(
    values = c("Additive Holt-Winters Forecast" = "blue",
               "Multiplicative Holt-Winters Forecast" = "red")
  ) +
  theme_minimal() +
  theme(legend.position = "top", legend.title = element_blank())
Warning: Ignoring unknown parameters: `PI`Warning: Ignoring unknown parameters: `PI`
# RMSE Calculation for Holt-Winters models
rmse_hw1 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_hw1)
rmse_hw2 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_hw2)
# Print RMSE values
cat("RMSE for Additive Holt-Winters Model:", rmse_hw1, "\n")
RMSE for Additive Holt-Winters Model: 14123520
cat("RMSE for Multiplicative Holt-Winters Model:", rmse_hw2, "\n")
RMSE for Multiplicative Holt-Winters Model: 13169921
Multiplicative model is better
The Holt-Winters implementation in R has a frequency limit of 24, so we cannot handle anything larger. Weekly and daily data have frequencies of 52 and 365 respectively, so we cannot fit this model with the R implementation so far.
ARIMA is an acronym for Auto Regressive Integrated Moving Average: ARIMA(p,d,q), where p refers to the AR part, q refers to the MA part, and d is the degree of first differencing involved.
First we need to check if the series is stationary.
# see if series is stationary
adf.test(sales_m_ts) #H0, series is non-stationary
Augmented Dickey-Fuller Test
data: sales_m_ts
Dickey-Fuller = -2.655, Lag order = 3, p-value = 0.3183
alternative hypothesis: stationary
# p-val > 0.05 => dont reject, non stationary: series is not stationary
adf.test(diff(sales_m_ts)) #H0, series is non-stationary
Augmented Dickey-Fuller Test
data: diff(sales_m_ts)
Dickey-Fuller = -3.3757, Lag order = 3, p-value = 0.07725
alternative hypothesis: stationary
# see the acf and pacf
tsdisplay(diff(sales_m_ts))
plot(sales_m_ts)
ndiffs(sales_m_ts)
[1] 1
tsdisplay(diff(sales_m_ts))
Correlogram plot maybe suggests AR-1 or MA-1 after first difference
# ARIMA(p,d,q) = (1,1,0)
arima1_m<- Arima(sales_m_ts, order=c(1,1,0))
summary(arima1_m)
Series: sales_m_ts
ARIMA(1,1,0)
Coefficients:
ar1
-0.3178
s.e. 0.1783
sigma^2 = 2.666e+14: log likelihood = -630.5
AIC=1265 AICc=1265.38 BIC=1268.11
Training set error measures:
ME RMSE MAE MPE MAPE
Training set 4664178 15867282 12399192 6.652001 14.52265
MASE ACF1
Training set 0.3786368 -0.07165611
# study residual to see if is a good model
resid1_m<- residuals(arima1_m)
tsdisplay(resid1_m)
Residuals look stationary after fitting ARIMA
auto_arima_m <- auto.arima(sales_m_ts)
auto_arima_m
Series: sales_m_ts
ARIMA(0,1,1) with drift
Coefficients:
ma1 drift
-0.3741 3475637
s.e. 0.1604 1659940
sigma^2 = 2.494e+14: log likelihood = -628.83
AIC=1263.66 AICc=1264.44 BIC=1268.33
autoplot(forecast(auto_arima_m))
checkresiduals(auto_arima_m)
Ljung-Box test
data: Residuals from ARIMA(0,1,1) with drift
Q* = 0.42136, df = 6, p-value = 0.9987
Model df: 1. Total lags used: 7
The residuals of the Autoarima look stationary
The AIC of the manual ARIMA is 1265, while that of the auto.arima is 1263. Let's use the auto.arima.
# Fitted values from both models (one-step-ahead in-sample fits)
fitted_auto_arima <- fitted(auto_arima_m)
fitted_arima1 <- fitted(arima1_m)
# Create a data frame for plotting: actual series plus both ARIMA fits
plot_data <- data.frame(
Time = time(sales_m_ts),
Actual = as.numeric(sales_m_ts),
Fitted_Auto_ARIMA = as.numeric(fitted_auto_arima),
Fitted_ARIMA1 = as.numeric(fitted_arima1)
)
# Melt the data frame (wide -> long for ggplot; `Series` factor levels
# follow the column order above)
plot_data_melted <- melt(plot_data, id.vars = "Time",
variable.name = "Series",
value.name = "Value")
# Plot
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) + # Actual values as points
geom_line(data = subset(plot_data_melted, Series != "Actual"), size = 1) + # Fitted values as lines
labs(
title = "Actual vs Fitted Values for ARIMA Models",
x = "Time",
y = "Sales",
color = "Series"
) +
scale_color_manual(
values = c("Actual" = "black", "Fitted_Auto_ARIMA" = "blue", "Fitted_ARIMA1" = "red"),
labels = c("Actual", "Fitted (Auto ARIMA)", "Fitted (ARIMA(1,1,0))")
) +
theme_minimal() +
theme(
legend.position = "top",
legend.title = element_blank()
)
NA
NA
# Calculate RMSE for the fitted values
# Calculate RMSE for each model
rmse_auto_arima <- calculate_rmse(observed = sales_m_ts, predicted = fitted_auto_arima)
rmse_arima1 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_arima1)
# Print RMSE values
cat("RMSE for Auto ARIMA Model:", rmse_auto_arima, "\n")
RMSE for Auto ARIMA Model: 15118942
cat("RMSE for ARIMA(1,1,0) Model:", rmse_arima1, "\n")
RMSE for ARIMA(1,1,0) Model: 15867282
The RMSE of the Autoarima is better as is the AIC.
The ARIMA(0,1,1) model can be described simply as a random walk with drift. Here’s what that means:
AR (AutoRegressive) Part:
I (Integrated) Part:
MA (Moving Average) Part:
An ARIMA(0,1,1) model is suitable when:
The d=1 parameter in ARIMA(0,1,1) indicates that the series is differenced once to achieve stationarity. Before differencing, the series may exhibit a linear trend or random walk behavior. After differencing, the series should show no trend and have relatively stable mean and variance
The q=1 in ARIMA(0,1,1) indicates that the series is modeled with a first-order moving average component after differencing. The autocorrelation function (ACF) of the differenced series should show: A significant spike at lag 1. Rapid decay to zero after lag 1. The partial autocorrelation function (PACF) should show no significant lags.
# study residual to see if is a good model
resid_autoarima_m<- residuals(auto_arima_m)
tsdisplay(resid_autoarima_m)
# see if series is stationary
adf.test(sales_w_ts) #H0, series is non-stationary
Augmented Dickey-Fuller Test
data: sales_w_ts
Dickey-Fuller = -2.7572, Lag order = 5, p-value = 0.2607
alternative hypothesis: stationary
# p-val > 0.05 => dont reject, non stationary: series is not stationary
adf.test(diff(sales_w_ts)) # after differencing once, the series is stationary
Warning: p-value smaller than printed p-value
Augmented Dickey-Fuller Test
data: diff(sales_w_ts)
Dickey-Fuller = -6.5801, Lag order = 5, p-value = 0.01
alternative hypothesis: stationary
After differencing, looks stationary
tsdisplay(diff(sales_w_ts))
Correlograms suggest maybe AR 1 or MA 1.
### Manual ARIMA------------
# ARIMA(p,d,q) = (1,1,0)
arima1_w<- Arima(sales_w_ts, order=c(1,1,0))
summary(arima1_w)
Series: sales_w_ts
ARIMA(1,1,0)
Coefficients:
ar1
-0.4393
s.e. 0.0742
sigma^2 = 1.161e+13: log likelihood = -2534.49
AIC=5072.98 AICc=5073.06 BIC=5079.06
Training set error measures:
ME RMSE MAE MPE MAPE
Training set 193180.2 3385012 2537492 -0.1816435 13.03938
MASE ACF1
Training set 0.3404022 -0.04968945
auto_arima_w <- auto.arima(sales_w_ts)
summary(auto_arima_w)
Series: sales_w_ts
ARIMA(0,1,1)
Coefficients:
ma1
-0.5426
s.e. 0.0758
sigma^2 = 1.122e+13: log likelihood = -2531.96
AIC=5067.91 AICc=5067.99 BIC=5073.99
Training set error measures:
ME RMSE MAE MPE MAPE
Training set 335675.7 3328293 2462721 0.6620165 12.38346
MASE ACF1
Training set 0.3303717 -0.01238358
AIC on the Autoarima is better, lets go with that one
checkresiduals(auto_arima_w)
Ljung-Box test
data: Residuals from ARIMA(0,1,1)
Q* = 17.171, df = 30, p-value = 0.9705
Model df: 1. Total lags used: 31
Residuals look stationary, see the plots for both models
# Fit ARIMA models for weekly data
# NOTE(review): these refit the exact specifications already estimated
# above (arima1_w / auto_arima_w), so the results are identical; the
# refit is redundant and kept only so this chunk is self-contained.
arima1_w <- Arima(sales_w_ts, order = c(1, 1, 0))
auto_arima_w <- auto.arima(sales_w_ts)
# Extract fitted values for both models (one-step-ahead in-sample fits)
fitted_arima1_w <- fitted(arima1_w)
fitted_auto_arima_w <- fitted(auto_arima_w)
# Create a data frame for plotting
plot_data <- data.frame(
Time = time(sales_w_ts),
Actual = as.numeric(sales_w_ts),
Fitted_ARIMA1 = as.numeric(fitted_arima1_w),
Fitted_Auto_ARIMA = as.numeric(fitted_auto_arima_w)
)
# Melt the data frame for ggplot2
plot_data_melted <- melt(plot_data, id.vars = "Time",
variable.name = "Series",
value.name = "Value")
# Plot using ggplot2
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) + # Actual values as points
geom_line(data = subset(plot_data_melted, Series != "Actual"), size = 1) + # Fitted values as lines
labs(
title = "Actual vs Fitted Values for ARIMA Models (Weekly)",
x = "Time",
y = "Sales",
color = "Series"
) +
scale_color_manual(
values = c("Actual" = "black", "Fitted_ARIMA1" = "red", "Fitted_Auto_ARIMA" = "blue"),
labels = c("Actual", "Fitted (ARIMA(1,1,0))", "Fitted (Auto ARIMA)")
) +
theme_minimal() +
theme(
legend.position = "top",
legend.title = element_blank()
)
# Calculate RMSE for both models
rmse_arima1_w <- calculate_rmse(observed = sales_w_ts, predicted = fitted_arima1_w)
rmse_auto_arima_w <- calculate_rmse(observed = sales_w_ts, predicted = fitted_auto_arima_w)
# Print RMSE values
cat("RMSE for ARIMA(1,1,0) Model (Weekly):", rmse_arima1_w, "\n")
RMSE for ARIMA(1,1,0) Model (Weekly): 3385012
cat("RMSE for Auto ARIMA Model (Weekly):", rmse_auto_arima_w, "\n")
RMSE for Auto ARIMA Model (Weekly): 3328293
The Auto-arima is also better in terms of RMSE
# see if series is stationary
adf.test(sales_d_ts) #H0, series is non-stationary
Warning: p-value smaller than printed p-value
Augmented Dickey-Fuller Test
data: sales_d_ts
Dickey-Fuller = -5.6283, Lag order = 10, p-value = 0.01
alternative hypothesis: stationary
# p-val < 0.05 => reject non stationary: series might be stationary
No need for differencing because the series is already stationary; try to model it with ARIMA.
tsdisplay(sales_d_ts)
Autocorrelograms are not easy to interpret, but lets try with a baseline model
# ARIMA(p,d,q) = (1,0,1) — no differencing since the ADF test above says
# the daily series is already stationary (comment fixed: the old comment
# said (2,1,0) but the code fits (1,0,1))
arima1_d<- Arima(sales_d_ts, order=c(1,0,1))
summary(arima1_d)
Series: sales_d_ts
ARIMA(1,0,1) with non-zero mean
Coefficients:
ar1 ma1 mean
0.3253 0.3666 3320965.47
s.e. 0.0434 0.0400 92676.64
sigma^2 = 2.184e+12: log likelihood = -16248.67
AIC=32505.35 AICc=32505.39 BIC=32525.14
Training set error measures:
ME RMSE MAE MPE MAPE
Training set 545.5599 1475571 1180971 -110.111 132.9826
MASE ACF1
Training set 0.5165637 0.0150054
checkresiduals(arima1_d)
Ljung-Box test
data: Residuals from ARIMA(1,0,1) with non-zero mean
Q* = 2743.8, df = 206, p-value < 2.2e-16
Model df: 2. Total lags used: 208
Residuals do not look entirely stationary
Try to model with automatic approach:
auto_arima_d <- auto.arima(sales_d_ts)
summary(auto_arima_d)
Series: sales_d_ts
ARIMA(5,1,5) with drift
Coefficients:
ar1 ar2 ar3 ar4 ar5 ma1 ma2
0.9544 -1.5459 1.0021 -1.0850 0.1718 -1.6378 1.8605
s.e. 0.0587 0.0493 0.0822 0.0449 0.0505 0.0495 0.0774
ma3 ma4 ma5 drift
-1.7958 1.3246 -0.6504 3652.866
s.e. 0.0845 0.0733 0.0363 2319.631
sigma^2 = 1.213e+12: log likelihood = -15926.04
AIC=31876.08 AICc=31876.38 BIC=31935.43
Training set error measures:
ME RMSE MAE MPE MAPE
Training set -1608.767 1094980 810378.5 -49.07305 67.93344
MASE ACF1
Training set 0.3544643 0.003023956
checkresiduals(auto_arima_d)
Ljung-Box test
data: Residuals from ARIMA(5,1,5) with drift
Q* = 189.3, df = 198, p-value = 0.6592
Model df: 10. Total lags used: 208
Residuals improve, and the AIC is lower for the auto.arima. Check the fit for both models.
# Extract fitted values for both models
fitted_arima1_d <- fitted(arima1_d)
fitted_auto_arima_d <- fitted(auto_arima_d)
# Create a data frame for plotting
plot_data <- data.frame(
Time = time(sales_d_ts),
Actual = as.numeric(sales_d_ts),
Fitted_ARIMA1 = as.numeric(fitted_arima1_d),
Fitted_Auto_ARIMA = as.numeric(fitted_auto_arima_d)
)
# Melt the data frame for ggplot2
plot_data_melted <- melt(plot_data, id.vars = "Time",
variable.name = "Series",
value.name = "Value")
# Plot using ggplot2
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) + # Actual values as points
geom_line(data = subset(plot_data_melted, Series != "Actual"), size = 1) + # Fitted values as lines
labs(
title = "Actual vs Fitted Values for ARIMA Models (Daily)",
x = "Time",
y = "Sales",
color = "Series"
) +
scale_color_manual(
values = c("Actual" = "black", "Fitted_ARIMA1" = "red", "Fitted_Auto_ARIMA" = "blue"),
labels = c("Actual", "Fitted (ARIMA(1,0,1))", "Fitted (Auto ARIMA)")
) +
theme_minimal() +
theme(
legend.position = "top",
legend.title = element_blank()
)
The plot is not readable, but check the RMSE for both models to confirm which fits better.
# Calculate RMSE for both models
rmse_arima1_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_arima1_d)
rmse_auto_arima_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_auto_arima_d)
# Print RMSE values
cat("RMSE for ARIMA(1,0,1) Model (Daily):", rmse_arima1_d, "\n")
RMSE for ARIMA(1,0,1) Model (Daily): 1475571
cat("RMSE for Auto ARIMA Model (Daily):", rmse_auto_arima_d, "\n")
RMSE for Auto ARIMA Model (Daily): 1094980
The auto.arima is much better; now try to improve it with seasonality, because the daily data looks seasonal with a 7-day period.
# Daily sales
tsdisplay(sales_d_ts) #
tsdisplay(diff(sales_d_ts))
sarima_d <- auto.arima(sales_d_ts, seasonal=TRUE)
summary(sarima_d)
Series: sales_d_ts
ARIMA(5,1,5) with drift
Coefficients:
ar1 ar2 ar3 ar4 ar5 ma1 ma2
0.9544 -1.5459 1.0021 -1.0850 0.1718 -1.6378 1.8605
s.e. 0.0587 0.0493 0.0822 0.0449 0.0505 0.0495 0.0774
ma3 ma4 ma5 drift
-1.7958 1.3246 -0.6504 3652.866
s.e. 0.0845 0.0733 0.0363 2319.631
sigma^2 = 1.213e+12: log likelihood = -15926.04
AIC=31876.08 AICc=31876.38 BIC=31935.43
Training set error measures:
ME RMSE MAE MPE MAPE
Training set -1608.767 1094980 810378.5 -49.07305 67.93344
MASE ACF1
Training set 0.3544643 0.003023956
resid_ds<- residuals(sarima_d)
tsdisplay(resid_ds)
# check for autocorrelation
Box.test(residuals(sarima_d), type="Ljung-Box")
Box-Ljung test
data: residuals(sarima_d)
X-squared = 0.0095375, df = 1, p-value = 0.9222
# A low p-value (<0.05) suggests residual autocorrelation.
Looks like the seasonal auto.arima is the same in terms of AIC; let's check the RMSE:
# Extract fitted values for both models
fitted_sarima_d <- fitted(sarima_d)
# Calculate RMSE for both models
rmse_sarima_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_sarima_d)
# Print RMSE values
cat("RMSE for Auto ARIMA Model (Daily):", rmse_auto_arima_d, "\n")
RMSE for Auto ARIMA Model (Daily): 1094980
cat("RMSE for Seasonal ARIMA Model (Daily):", rmse_sarima_d, "\n")
RMSE for Seasonal ARIMA Model (Daily): 1094980
The RMSE is exactly the same, they are the same model.
Refine SARIMA with external regressors
# redefine sales_d_ts from the merged daily data frame
# NOTE(review): exp() is applied to sales_cop and the regressors, which
# implies the columns of df_merged_d were log-transformed upstream —
# confirm against where df_merged_d is built.
head(df_merged_d)
sales_d_ts <- ts(exp(df_merged_d$sales_cop), frequency=365, start=c(2021, 334)) # 334 is November 30
seasonal_sales_d_ts <- ts(exp(df_merged_d$sales_cop), frequency=7, start=c(2021, 334)) # 334 is November 30
plot(sales_d_ts)
tsdisplay(sales_d_ts,lag.max = 30)
tsdisplay(seasonal_sales_d_ts,lag.max = 30)
# define regressors
# Select specific columns by name
x_regressors_d <- df_merged_d %>% select(rain_sum, fx, tmedian)
# Apply the exponential function to each column (undo the upstream log transform)
x_regressors_d <- as.data.frame(apply(x_regressors_d, 2, exp))
# Convert to a matrix for ARIMA modeling (auto.arima requires a matrix xreg)
x_regressors_d <- as.matrix(x_regressors_d)
# fit the model on sales
# Fit an auto.arima model with seasonal component and external regressors
# NOTE(review): with frequency = 365 the seasonal search may effectively
# be skipped (the fitted model below has no seasonal terms) — consider
# fitting on seasonal_sales_d_ts (frequency = 7) instead.
sarimax_model_d <- auto.arima(
sales_d_ts,
seasonal = TRUE, # Enable seasonal components
xreg = x_regressors_d # External regressors
)
# Display the summary of the fitted model
summary(sarimax_model_d)
Series: sales_d_ts
Regression with ARIMA(5,1,5) errors
Coefficients:
ar1 ar2 ar3 ar4 ar5 ma1 ma2
0.9230 -1.5215 0.9550 -1.0569 0.1394 -1.6554 1.8823
s.e. 0.0605 0.0502 0.0831 0.0448 0.0509 0.0516 0.0805
ma3 ma4 ma5 drift rain_sum fx
-1.8168 1.3447 -0.6535 3880.584 38.8494 -375.8824
s.e. 0.0836 0.0730 0.0373 2346.036 112.5507 383.7332
tmedian
62715.8
s.e. 30430.2
sigma^2 = 1.333e+12: log likelihood = -15973.62
AIC=31977.25 AICc=31977.72 BIC=32051.44
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set -2264.148 1146332 833227 -3414423 3414442 0.3598772
ACF1
Training set 0.000552024
The AIC actually decreases, lets check the RMSE
# Extract fitted values for all models
fitted_sarimax_d <- fitted(sarimax_model_d)
# Calculate RMSE for all models
rmse_sarimax_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_sarimax_d)
# Print RMSE values
cat("RMSE for Auto ARIMA Model (Daily):", rmse_auto_arima_d, "\n")
RMSE for Auto ARIMA Model (Daily): 1094980
cat("RMSE for Seasonal ARIMA Model (Daily):", rmse_sarima_d, "\n")
RMSE for Seasonal ARIMA Model (Daily): 1094980
cat("RMSE for SARIMAX Model (Daily):", rmse_sarimax_d, "\n")
RMSE for SARIMAX Model (Daily): 1146332
The RMSE also worsens, so stay with regular Auto-ARIMA
Monthly Sales
# Simple exponential smoothing on monthly sales:
# fit_m1 uses a fixed low alpha (slow reaction), fit_m2 a fixed high alpha
# (fast reaction), fit_m3 lets ses() estimate alpha by maximum likelihood.
fit_m1 <- ses(sales_m_ts, alpha = 0.2, initial = 'simple', h=5)
fit_m2 <- ses(sales_m_ts, alpha = 0.6, initial = 'simple', h=5)
fit_m3 <- ses(sales_m_ts, h=5)
# Overlay the three fitted series: blue = alpha 0.2, red = alpha 0.6,
# green = optimized alpha
plot(sales_m_ts, ylab='Monthly Sales', xlab='Months')
lines(fitted(fit_m1), col='blue', type='o')
lines(fitted(fit_m2), col='red', type='o')
lines(fitted(fit_m3), col='green', type='o')
forecast_m1 <- ses(sales_m_ts, h=5)
# Accuracy of one-step-ahead training errors
round(accuracy(forecast_m1),2)
ME RMSE MAE MPE MAPE MASE ACF1
Training set 3207582 17626876 13250973 -12.1 29.7 0.4 -0.05
summary(forecast_m1)
Forecast method: Simple exponential smoothing
Model Information:
Simple exponential smoothing
Call:
ses(y = sales_m_ts, h = 5)
Smoothing parameters:
alpha = 0.675
Initial states:
l = 56978123.3871
sigma: 18137906
AIC AICc BIC
1336.322 1337.072 1341.073
Error measures:
ME RMSE MAE MPE MAPE
Training set 3207582 17626876 13250973 -12.09656 29.70395
MASE ACF1
Training set 0.4046478 -0.0504105
Forecasts:
autoplot(forecast_m1) + autolayer(fitted(forecast_m1),series='Fitted') + ylab("Monthly Sales")+xlab("Months")
# Extract fitted values for each model
fitted_m1 <- fitted(fit_m1)
fitted_m2 <- fitted(fit_m2)
fitted_m3 <- fitted(fit_m3)
# Calculate RMSE for each model
rmse_m1 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_m1)
rmse_m2 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_m2)
rmse_m3 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_m3)
# Print RMSE values
cat("RMSE for SES Model 1 (alpha = 0.2):", rmse_m1, "\n")
RMSE for SES Model 1 (alpha = 0.2): 23924620
cat("RMSE for SES Model 2 (alpha = 0.6):", rmse_m2, "\n")
RMSE for SES Model 2 (alpha = 0.6): 16189623
cat("RMSE for SES Model 3 (Optimized alpha):", rmse_m3, "\n")
RMSE for SES Model 3 (Optimized alpha): 17626876
rmse_exp_sm_m <- rmse_m2
Weekly Sales
For weekly data, exponential smoothing can capture longer-term trends and seasonal patterns that repeat on a weekly basis. Weekly data can also have seasonal components related to months, quarters, or years.
# Fit three SES variants on the weekly series: two fixed smoothing
# parameters (alpha = 0.2 and 0.6, simple initialization) and one where
# ses() optimizes alpha itself; h = 5 forecast periods each
fit_w1 <- ses(sales_w_ts, alpha = 0.2, initial = 'simple', h=5)
fit_w2 <- ses(sales_w_ts, alpha = 0.6, initial = 'simple', h=5)
fit_w3 <- ses(sales_w_ts, h=5)
# Overlay the three fitted series on the observed weekly sales
plot(sales_w_ts, ylab='Weekly Sales', xlab='Weeks')
lines(fitted(fit_w1), col='blue', type='o')
lines(fitted(fit_w2), col='red', type='o')
lines(fitted(fit_w3), col='green', type='o')
# Refit the optimized-alpha model and inspect its accuracy / summary
forecast_w1 <- ses(sales_w_ts, h=5)
round(accuracy(forecast_w1),2)
ME RMSE MAE MPE MAPE MASE ACF1
Training set 306073.8 3331847 2478469 -0.46 13.08 0.33 -0.01
summary(forecast_w1)
Forecast method: Simple exponential smoothing
Model Information:
Simple exponential smoothing
Call:
ses(y = sales_w_ts, h = 5)
Smoothing parameters:
alpha = 0.4502
Initial states:
l = 5339607.7196
sigma: 3353553
AIC AICc BIC
5443.633 5443.791 5452.763
Error measures:
ME RMSE MAE MPE MAPE
Training set 306073.8 3331847 2478469 -0.4636447 13.08087
MASE ACF1
Training set 0.3324843 -0.006458446
Forecasts:
# Plot the weekly forecast with its in-sample fitted values
autoplot(forecast_w1) + autolayer(fitted(forecast_w1),series='Fitted') + ylab("Weekly Sales")+xlab("Weeks")
# Extract fitted values for each model
fitted_w1 <- fitted(fit_w1)
fitted_w2 <- fitted(fit_w2)
fitted_w3 <- fitted(fit_w3)
# Calculate RMSE for each model
rmse_w1 <- calculate_rmse(observed = sales_w_ts, predicted = fitted_w1)
rmse_w2 <- calculate_rmse(observed = sales_w_ts, predicted = fitted_w2)
rmse_w3 <- calculate_rmse(observed = sales_w_ts, predicted = fitted_w3)
# Print RMSE values
cat("RMSE for SES Model 1 (alpha = 0.2):", rmse_w1, "\n")
RMSE for SES Model 1 (alpha = 0.2): 3564863
cat("RMSE for SES Model 2 (alpha = 0.6):", rmse_w2, "\n")
RMSE for SES Model 2 (alpha = 0.6): 3363852
cat("RMSE for SES Model 3 (Optimized alpha):", rmse_w3, "\n")
RMSE for SES Model 3 (Optimized alpha): 3331847
# Keep the best weekly SES RMSE: the optimized-alpha model wins here
rmse_exp_sm_w <- rmse_w3
Daily Sales
For daily data, exponential smoothing can be used to forecast short-term trends and seasonal patterns. When applying exponential smoothing to daily data, you need to consider:
Seasonality: Daily data often exhibit seasonal patterns, such as weekly cycles (e.g., higher sales on weekends).
Holidays and special events: These can cause irregular patterns in daily data that may need to be accounted for.
# Fit the same three SES variants on the daily series
fit_d1 <- ses(sales_d_ts, alpha = 0.2, initial = 'simple', h=5)
fit_d2 <- ses(sales_d_ts, alpha = 0.6, initial = 'simple', h=5)
fit_d3 <- ses(sales_d_ts, h=5)
# Overlay the three fitted series on the observed daily sales
plot(sales_d_ts, ylab='Daily Sales', xlab='Days')
lines(fitted(fit_d1), col='blue', type='o')
lines(fitted(fit_d2), col='red', type='o')
lines(fitted(fit_d3), col='green', type='o')
forecast_d1 <- ses(sales_d_ts, h=5)
# NOTE(review): the enormous MPE/MAPE below (millions of %) is symptomatic of
# zero-sales days in the daily series (NAs were set to 0 on load), which make
# percentage errors meaningless here — rely on RMSE/MAE/MASE instead.
round(accuracy(forecast_d1),2)
ME RMSE MAE MPE MAPE MASE ACF1
Training set 91166.44 1642520 1279475 -3521871 3521898 0.55 0.38
summary(forecast_d1)
Forecast method: Simple exponential smoothing
Model Information:
Simple exponential smoothing
Call:
ses(y = sales_d_ts, h = 5)
Smoothing parameters:
alpha = 0.0372
Initial states:
l = 863041.8751
sigma: 1644102
AIC AICc BIC
36999.28 36999.30 37014.12
Error measures:
ME RMSE MAE MPE MAPE MASE
Training set 91166.44 1642520 1279475 -3521871 3521898 0.5526151
ACF1
Training set 0.3755302
Forecasts:
# Plot the daily forecast with its in-sample fitted values
autoplot(forecast_d1) + autolayer(fitted(forecast_d1),series='Fitted') + ylab("Daily Sales")+xlab("Days")
# Extract fitted values for each model
fitted_d1 <- fitted(fit_d1)
fitted_d2 <- fitted(fit_d2)
fitted_d3 <- fitted(fit_d3)
# Calculate RMSE for each model
rmse_d1 <- calculate_rmse(observed = sales_d_ts, predicted = fitted_d1)
rmse_d2 <- calculate_rmse(observed = sales_d_ts, predicted = fitted_d2)
rmse_d3 <- calculate_rmse(observed = sales_d_ts, predicted = fitted_d3)
# Print RMSE values
cat("RMSE for SES Model 1 (alpha = 0.2):", rmse_d1, "\n")
RMSE for SES Model 1 (alpha = 0.2): 1714231
cat("RMSE for SES Model 2 (alpha = 0.6):", rmse_d2, "\n")
RMSE for SES Model 2 (alpha = 0.6): 1803005
cat("RMSE for SES Model 3 (Optimized alpha):", rmse_d3, "\n")
RMSE for SES Model 3 (Optimized alpha): 1642520
# Keep the best daily SES RMSE: the optimized-alpha model
rmse_exp_sm_d <- rmse_d3
# Hybrid model, monthly frequency: a Guseo-Guidolin Model (GGM) captures the
# diffusion (cumulative adoption) trend, then a SARIMAX on the cumulative
# series uses the GGM's instantaneous fit as an external regressor to refine
# the short-run dynamics.
# 1. GGM on sales_m_ts
summary(ggm1)
Call: ( Guseo Guidolin Model )
GGM(series = sales_m_ts, mt = "base", display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-27544719 -8809035 742251 -337148 7701034 23238738
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 13318300 on 31 degrees of freedom
Multiple R-squared: 0.999873 Residual sum of squares: 5.498691e+15
# 2. Get the instantaneous fitted values of the GGM
# (GGM fits the cumulative series; make.instantaneous() differences it back
# to per-period sales)
fitted_ggm_cum <- fitted(ggm1)
fitted_ggm_instant <- make.instantaneous(fitted_ggm_cum)
# 3. Auto-SARIMAX on cumsum(sales_m_ts), with xreg = instantaneous GGM
# First, create the cumulative sales
cumulative_sales_m_ts <- cumsum(sales_m_ts)
# stepwise/approximation disabled for an exhaustive (slower but more
# thorough) model search
sarima_m <- auto.arima(
cumulative_sales_m_ts,
xreg = fitted_ggm_instant,
seasonal = TRUE,
stepwise = FALSE,
approximation= FALSE
)
# 4. Extract the fitted (cumulative) values of the SARIMAX
fitted_cum_sarima <- fitted(sarima_m)
# 5. Convert SARIMAX fitted cumulative back to instantaneous
# One simple approach: prepend 0 and take first differences, so the result
# has the same length as sales_m_ts
fitted_instant_sarima <- diff(c(0, fitted_cum_sarima))
# Now fitted_instant_sarima should have the same length as sales_m_ts.
# Turn it into a time series object for neat plotting
fitted_instant_sarima_ts <- ts(
fitted_instant_sarima,
start = start(sales_m_ts),
frequency = frequency(sales_m_ts)
)
# 6. Plot the actual monthly sales and the SARIMAX instantaneous fit
plot(
sales_m_ts,
type="o",
col="black",
lwd=2,
main="Actual Monthly Sales vs. SARIMAX Fitted (Instantaneous)",
xlab="Time",
ylab="Monthly Sales"
)
lines(
fitted_instant_sarima_ts,
col="red",
lty=2,
lwd=2
)
legend(
"bottomright",
legend = c("Actual Sales", "SARIMAX Fit (Instantaneous)"),
col = c("black", "red"),
lty = c(1, 2),
lwd = c(2, 2)
)
# Calculate RMSE for fitted_instantaneous_ts against sales_m_ts
rmse_mixture_m <- calculate_rmse(observed = sales_m_ts, predicted = fitted_instant_sarima_ts)
# Print the RMSE value
cat("RMSE for Fitted Instantaneous Values (GGM + SARIMAX):", rmse_mixture_m, "\n")
RMSE for Fitted Instantaneous Values (GGM + SARIMAX): 20987648
# Residual diagnostics on the instantaneous scale
resid_mixture_m <- sales_m_ts - fitted_instant_sarima_ts
tsdisplay(resid_mixture_m)
Residuals have autocorrelation at lag 1
# Same hybrid GGM + SARIMAX approach as for monthly, now on the weekly series
# 1. GGM on sales_w_ts
summary(ggm1_w)
Call: ( Guseo Guidolin Model )
GGM(series = sales_w_ts, mt = "base", display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-30941134 -8439496 1818273 88704 8469414 21113528
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 12271825 on 150 degrees of freedom
Multiple R-squared: 0.999876 Residual sum of squares: 2.258965e+16
# 2. Get the instantaneous fitted values of the GGM
fitted_ggm_cum <- fitted(ggm1_w) # Cumulative fit from GGM
fitted_ggm_instant <- make.instantaneous(fitted_ggm_cum) # Convert to weekly flow
# 3. Auto-SARIMAX on cumsum(sales_w_ts), with xreg = instantaneous GGM
# Create the cumulative weekly series
cumulative_sales_w_ts <- cumsum(sales_w_ts)
# Fit an auto.arima with the GGM instantaneous series as xreg
sarima_w <- auto.arima(
cumulative_sales_w_ts,
xreg = fitted_ggm_instant,
seasonal = TRUE,
stepwise = FALSE,
approximation= FALSE
)
# 4. Extract the fitted (cumulative) values of the SARIMAX
fitted_cum_sarima <- fitted(sarima_w)
# This is the model's in-sample one-step-ahead *cumulative* fit.
# 5. Convert SARIMAX fitted cumulative back to instantaneous
# (prepend 0 and difference, so lengths match sales_w_ts)
fitted_instant_sarima <- diff(c(0, fitted_cum_sarima))
# Turn it into a time series object
fitted_instant_sarima_ts <- ts(
fitted_instant_sarima,
start = start(sales_w_ts),
frequency = frequency(sales_w_ts)
)
# 6. Plot the actual weekly sales and the SARIMAX instantaneous fit
plot(
sales_w_ts,
type = "o",
col = "black",
lwd = 2,
main = "Actual Weekly Sales vs. SARIMAX Fitted (Instantaneous)",
xlab = "Time (Weeks)",
ylab = "Weekly Sales"
)
lines(
fitted_instant_sarima_ts,
col = "red",
lty = 2,
lwd = 2
)
legend(
"bottomright",
legend = c("Actual Weekly Sales", "SARIMAX Fit (Instantaneous)"),
col = c("black", "red"),
lty = c(1, 2),
lwd = c(2, 2)
)
# Residuals
# Step 1: Extract residuals from the SARIMA model (cumulative scale)
resid_w <- residuals(sarima_w)
# Step 2: Visualize residuals
# Time series plot of residuals
tsdisplay(resid_w, main = "Residual Diagnostics for SARIMA Model")
# Step 3: Test residuals for stationarity (ADF: H0 = unit root)
adf_test <- adf.test(resid_w)
Warning: p-value smaller than printed p-value
cat("ADF Test p-value:", adf_test$p.value, "\n")
ADF Test p-value: 0.01
if (adf_test$p.value < 0.05) {
cat("The residuals are stationary.\n")
} else {
cat("The residuals are not stationary.\n")
}
The residuals are stationary.
# Step 4: Test residuals for white noise (no autocorrelation)
# Ljung-Box: H0 = residuals are uncorrelated up to lag 20
ljung_box_test <- Box.test(resid_w, lag = 20, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test$p.value, "\n")
Ljung-Box Test p-value: 0.8993948
if (ljung_box_test$p.value > 0.05) {
cat("The residuals resemble white noise (uncorrelated).\n")
} else {
cat("The residuals show significant autocorrelation.\n")
}
The residuals resemble white noise (uncorrelated).
Stationary residuals with no significant autocorrelation (consistent with white noise).
#### RMSE for SARIMAX Predictions ####
# In-sample RMSE of the weekly GGM + SARIMAX hybrid on the instantaneous scale
rmse_mixture_w <- calculate_rmse(observed = sales_w_ts, predicted = fitted_instant_sarima_ts)
# Print RMSE for SARIMAX
cat("RMSE for SARIMAX Predictions:", rmse_mixture_w, "\n")
RMSE for SARIMAX Predictions: 4567633
# Hybrid GGM + SARIMAX on the daily series. The daily data are standardized
# before auto.arima so the optimizer converges.
# 1. GGM on sales_d_ts
# NOTE(review): the printed call shows ggm1_d was fit on a scaled series
# (sales_scaled), not raw sales_d_ts — confirm upstream that this is intended.
summary(ggm1_d)
Call: ( Guseo Guidolin Model )
GGM(series = sales_scaled, mt = "base", display = T)
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.40144 -0.89220 0.18363 0.04369 0.98854 2.48411
Coefficients:
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error 1.202014 on 1035 degrees of freedom
Multiple R-squared: 0.999862 Residual sum of squares: 1495.407
# 2. Get the instantaneous fitted values of the GGM
fitted_ggm_cum <- fitted(ggm1_d) # Cumulative fit from GGM
fitted_ggm_instant <- make.instantaneous(fitted_ggm_cum) # Convert to daily flow
# 3. Auto-SARIMAX on cumsum(sales_d_ts), with xreg = instantaneous GGM
# Create the cumulative daily series
cumulative_sales_d_ts <- cumsum(sales_d_ts)
# first scale data so it can converge
cumulative_sales_d_ts <- scale(cumulative_sales_d_ts)
fitted_ggm_instant <- scale(fitted_ggm_instant)
# Store the centering/scaling attributes so fitted values can be transformed
# back to the original units afterwards
cumulative_sales_center <- attr(cumulative_sales_d_ts, "scaled:center")
cumulative_sales_scale <- attr(cumulative_sales_d_ts, "scaled:scale")
# NOTE(review): the xreg center/scale below are stored but never used in the
# back-transform — only the cumulative-sales attributes are needed for that.
ggm_instant_center <- attr(fitted_ggm_instant, "scaled:center")
ggm_instant_scale <- attr(fitted_ggm_instant, "scaled:scale")
# stepwise/approximation enabled here (faster search) — the daily series is
# much longer than the monthly/weekly ones
sarima_d <- auto.arima(
cumulative_sales_d_ts,
xreg = fitted_ggm_instant,
stepwise = TRUE,
approximation = TRUE
)
# 4. Extract the fitted (cumulative) values of the SARIMAX
# Fitted cumulative values (scaled)
fitted_cum_sarima <- fitted(sarima_d)
# Scale back to original cumulative values
fitted_cum_sarima_original <- fitted_cum_sarima * cumulative_sales_scale + cumulative_sales_center
# Convert cumulative fitted to instantaneous fitted (original units)
fitted_instant_sarima_original <- diff(c(0, fitted_cum_sarima_original))
# Create time series object for instantaneous fitted values
fitted_instant_sarima_original_ts <- ts(
fitted_instant_sarima_original,
start = start(sales_d_ts),
frequency = frequency(sales_d_ts)
)
# Plot the results
plot(
sales_d_ts,
type = "o",
col = "black",
lwd = 2,
main = "Actual Daily Sales vs. SARIMAX Fitted (Instantaneous)",
xlab = "Time (Days)",
ylab = "Daily Sales"
)
lines(
fitted_instant_sarima_original_ts,
col = "red",
lty = 2,
lwd = 2
)
legend(
"bottomright",
legend = c("Actual Daily Sales", "SARIMAX Fit (Instantaneous)"),
col = c("black", "red"),
lty = c(1, 2),
lwd = c(2, 2)
)
#### Residuals-----------------------
# Extract residuals from the SARIMA model (scaled cumulative scale)
resid_d <- residuals(sarima_d)
# Visualize residuals
tsdisplay(resid_d, main = "Residual Diagnostics for SARIMA Model")
# Test residuals for stationarity (ADF: H0 = unit root)
adf_test <- adf.test(resid_d)
Warning: p-value smaller than printed p-value
cat("ADF Test p-value:", adf_test$p.value, "\n")
ADF Test p-value: 0.01
if (adf_test$p.value < 0.05) {
cat("The residuals are stationary.\n")
} else {
cat("The residuals are not stationary.\n")
}
The residuals are stationary.
# Test residuals for white noise (no autocorrelation up to lag 20)
ljung_box_test <- Box.test(resid_d, lag = 20, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test$p.value, "\n")
Ljung-Box Test p-value: 0.5910185
if (ljung_box_test$p.value > 0.05) {
cat("The residuals resemble white noise (uncorrelated).\n")
} else {
cat("The residuals show significant autocorrelation.\n")
}
The residuals resemble white noise (uncorrelated).
#### RMSE for SARIMAX Predictions ####
# In-sample RMSE of the daily hybrid, back in original units
rmse_mixture_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_instant_sarima_original_ts)
# Print RMSE for SARIMAX
cat("RMSE for SARIMAX Predictions:", rmse_mixture_d, "\n")
RMSE for SARIMAX Predictions: 1629664
This model was introduced by Facebook (S. J. Taylor & Letham, 2018), originally for forecasting daily data with weekly and yearly seasonality, plus holiday effects. It was later extended to cover more types of seasonal data. It works best with time series that have strong seasonality and several seasons of historical data.
Prophet can be considered a nonlinear regression model (Chapter 7) of the form y_t = g(t) + s(t) + h(t) + ε_t, where g(t) describes a piecewise-linear trend (or “growth term”), s(t) describes the various seasonal patterns, h(t) captures the holiday effects, and ε_t is a white noise error term.
The knots (or changepoints) for the piecewise-linear trend are automatically selected if not explicitly specified. Optionally, a logistic function can be used to set an upper bound on the trend.
The seasonal component consists of Fourier terms of the relevant periods. By default, order 10 is used for annual seasonality and order 3 is used for weekly seasonality.
Holiday effects are added as simple dummy variables.
The model is estimated using a Bayesian approach to allow for automatic selection of the changepoints and other model characteristics.
# (prophet is already attached at the top of the script; kept for clarity)
library(prophet)
The input to Prophet is always a dataframe with two columns: ds and y . The ds (datestamp) column should be of a format, ideally YYYY-MM-DD for a date or YYYY-MM-DD HH:MM:SS for a timestamp. The y column must be numeric, and represents the measurement we wish to forecast.
# sales monthly
ggplot(df_merged_m, aes(x=month, y=sales_m)) +
geom_line() + ggtitle("Monthly Sales of Restaurant")
head(df_merged_m)
#Prophet model
# model with no seasonality
# Keep only the first two columns (date + monthly sales) and rename to the
# ds/y pair Prophet requires
df_prophet_m <- df_merged_m[1:2]
head(df_prophet_m)
colnames(df_prophet_m) = c("ds", "y")
# NOTE(review): exp() suggests sales_m was log-transformed upstream and is
# being back-transformed to the original scale here — confirm against the
# construction of df_merged_m.
df_prophet_m$y <- exp(df_prophet_m$y)
prophet_sales_m <- prophet(df_prophet_m)
Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
head(df_prophet_m)
# Step 2: Create a future dataframe for the next 14 months
future_sales_m <- make_future_dataframe(
prophet_sales_m,
periods = 14, # Forecast for 14 months
freq = 'month', # Monthly frequency
include_history = TRUE # Include historical data in the future dataframe
)
tail(future_sales_m)
# Forecast over history + 14 future months
forecast_sales_m <- predict(prophet_sales_m, future_sales_m)
tail(forecast_sales_m[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
plot(prophet_sales_m, forecast_sales_m)
prophet_plot_components(prophet_sales_m, forecast_sales_m)
dyplot.prophet(prophet_sales_m, forecast_sales_m)
Warning: `select_()` was deprecated in dplyr 0.7.0.
Please use `select()` instead.Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
#Use the original dataframe to get the fitted values
fitted_values <- predict(prophet_sales_m, df_prophet_m)
# Extract the fitted values (column 'yhat' contains the fitted values)
fitted_y <- fitted_values$yhat
# Calculate RMSE
actual_y <- df_prophet_m$y # Actual sales values
rmse_prophet_m <- calculate_rmse(observed = actual_y, predicted = fitted_y)
# Print RMSE
cat("RMSE for Prophet Fitted Values:", rmse_prophet_m, "\n")
RMSE for Prophet Fitted Values: 16786939
Residuals for prophet
# Calculate Residuals
residuals_prophet <- actual_y - fitted_y # Residuals = Actual - Fitted
# Visualize Residuals using tsdisplay
tsdisplay(residuals_prophet, main = "Residual Diagnostics for Prophet Model")
# Perform ADF Test for Stationarity (H0 = unit root)
adf_test <- adf.test(residuals_prophet)
cat("ADF Test p-value:", adf_test$p.value, "\n")
ADF Test p-value: 0.3278912
if (adf_test$p.value < 0.05) {
cat("Residuals are stationary (reject H0).\n")
} else {
cat("Residuals are not stationary (fail to reject H0).\n")
}
Residuals are not stationary (fail to reject H0).
# Perform Serial Correlation Test (Ljung-Box, H0 = no autocorrelation)
ljung_box_test <- Box.test(residuals_prophet, lag = 10, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test$p.value, "\n")
Ljung-Box Test p-value: 6.527107e-10
if (ljung_box_test$p.value > 0.05) {
cat("Residuals resemble white noise (no significant autocorrelation).\n")
} else {
cat("Residuals show significant autocorrelation.\n")
}
Residuals show significant autocorrelation.
# Prophet on the weekly series — same workflow as the monthly model
ggplot(df_merged_w, aes(x=week, y=sales_w)) +
geom_line() + ggtitle("Weekly Sales of Restaurant")
head(df_merged_w)
#Prophet model
# model with no seasonality
# Keep only date + weekly sales and rename to Prophet's ds/y pair
df_prophet_w <- df_merged_w[1:2]
colnames(df_prophet_w) = c("ds", "y")
# NOTE(review): exp() back-transforms presumably log-scaled weekly sales —
# confirm against the construction of df_merged_w.
df_prophet_w$y <- exp(df_prophet_w$y)
df_prophet_w
prophet_sales_w <- prophet(df_prophet_w)
Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
Predictions are made on a dataframe with a column ds
containing the dates for which predictions are to be made. The
make_future_dataframe function takes the model object and a
number of periods to forecast and produces a suitable dataframe. By
default it will also include the historical dates so we can evaluate
in-sample fit.
# Forecast horizon: 52 weeks beyond the observed history
future_sales_w <- make_future_dataframe(prophet_sales_w,
periods = 52,
freq = 'week',
include_history = T)
tail(future_sales_w)
As with most modeling procedures in R, we use the generic
predict function to get our forecast. The
forecast object is a dataframe with a column
yhat containing the forecast. It has additional columns for
uncertainty intervals and seasonal components.
# R
forecast_sales_w <- predict(prophet_sales_w, future_sales_w)
tail(forecast_sales_w[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
plot(prophet_sales_w, forecast_sales_w)
You can use the prophet_plot_components function to see
the forecast broken down into trend, weekly seasonality, and yearly
seasonality.
prophet_plot_components(prophet_sales_w, forecast_sales_w)
dyplot.prophet(prophet_sales_w, forecast_sales_w)
# Use the original dataset to get fitted values
fitted_values_w <- predict(prophet_sales_w, df_prophet_w)
# Extract the fitted values (column 'yhat' contains the fitted values)
fitted_y_w <- fitted_values_w$yhat
# Ensure alignment between actual values (y) and fitted values (yhat)
actual_y_w <- df_prophet_w$y # Actual weekly sales values
# Calculate RMSE for weekly data
rmse_prophet_w <- calculate_rmse(observed = actual_y_w, predicted = fitted_y_w)
# Print RMSE
cat("RMSE for Prophet Fitted Values (Weekly):", rmse_prophet_w, "\n")
RMSE for Prophet Fitted Values (Weekly): 3246088
# Calculate Residuals
residuals_prophet_w <- actual_y_w - fitted_y_w # Residuals = Actual - Fitted
# Visualize Residuals using tsdisplay
tsdisplay(residuals_prophet_w, main = "Residual Diagnostics for Weekly Prophet Model")
# Perform ADF Test for Stationarity (H0 = unit root)
adf_test_w <- adf.test(residuals_prophet_w)
cat("ADF Test p-value:", adf_test_w$p.value, "\n")
ADF Test p-value: 0.0238759
if (adf_test_w$p.value < 0.05) {
cat("Residuals are stationary (reject H0).\n")
} else {
cat("Residuals are not stationary (fail to reject H0).\n")
}
Residuals are stationary (reject H0).
# Perform Serial Correlation Test (Ljung-Box, H0 = no autocorrelation)
ljung_box_test_w <- Box.test(residuals_prophet_w, lag = 10, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test_w$p.value, "\n")
Ljung-Box Test p-value: 5.447864e-11
if (ljung_box_test_w$p.value > 0.05) {
cat("Residuals resemble white noise (no significant autocorrelation).\n")
} else {
cat("Residuals show significant autocorrelation.\n")
}
Residuals show significant autocorrelation.
# Prophet on the daily series
head(sales_d_ts)
Time Series:
Start = c(2021, 334)
End = c(2021, 339)
Frequency = 365
[1] 1493701 673701 1205301 1340901 1343701 360901
plot(sales_d_ts)
# Build the ds/y frame directly from the daily ts and the merged date column
# (no exp() here — the daily series is already on the original scale)
sales_d_values <- as.numeric(sales_d_ts) # Extract numeric values
df_prophet_d <- data.frame(
ds = df_merged_d$date, # Dates
y = sales_d_values # Sales values
)
#Prophet model
#prophet_sales_d <- prophet(df_prophet, weekly.seasonality = TRUE)
prophet_sales_d <- prophet(df_prophet_d)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Forecast horizon: 120 days beyond the observed history
future_sales_d <- make_future_dataframe(prophet_sales_d,
periods = 120,
freq = 'day',
include_history = T)
tail(future_sales_d)
forecast_sales_d <- predict(prophet_sales_d, future_sales_d)
tail(forecast_sales_d[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
plot(prophet_sales_d, forecast_sales_d)
prophet_plot_components(prophet_sales_d, forecast_sales_d)
dyplot.prophet(prophet_sales_d, forecast_sales_d)
# Extract fitted values for RMSE calculation
fitted_values_d <- predict(prophet_sales_d, df_prophet_d)
# Extract fitted values (column 'yhat')
fitted_y_d <- fitted_values_d$yhat
actual_y_d <- df_prophet_d$y # Actual sales values
# Step 8: Calculate RMSE
rmse_prophet_d <- calculate_rmse(observed = actual_y_d, predicted = fitted_y_d)
# Print RMSE
cat("RMSE for Prophet Fitted Values (Daily):", rmse_prophet_d, "\n")
RMSE for Prophet Fitted Values (Daily): 1071375
# Calculate Residuals
residuals_prophet_d <- actual_y_d - fitted_y_d # Residuals = Actual - Fitted
# Visualize Residuals using tsdisplay
tsdisplay(residuals_prophet_d, main = "Residual Diagnostics for Daily Prophet Model")
# Perform ADF Test for Stationarity (H0 = unit root)
adf_test_d <- adf.test(residuals_prophet_d)
Warning: p-value smaller than printed p-value
cat("ADF Test p-value:", adf_test_d$p.value, "\n")
ADF Test p-value: 0.01
if (adf_test_d$p.value < 0.05) {
cat("Residuals are stationary (reject H0).\n")
} else {
cat("Residuals are not stationary (fail to reject H0).\n")
}
Residuals are stationary (reject H0).
# Perform Serial Correlation Test (Ljung-Box, H0 = no autocorrelation)
ljung_box_test_d <- Box.test(residuals_prophet_d, lag = 20, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test_d$p.value, "\n")
Ljung-Box Test p-value: 0
if (ljung_box_test_d$p.value > 0.05) {
cat("Residuals resemble white noise (no significant autocorrelation).\n")
} else {
cat("Residuals show significant autocorrelation.\n")
}
Residuals show significant autocorrelation.
# Collect the in-sample RMSEs of every model/frequency combination fitted
# above into one vector (order: monthly, weekly, daily within each model).
# BUG FIX: the 18th entry was rmse_exp_sm_m repeated (the printed output
# below shows 16189623 twice); the daily exponential-smoothing RMSE
# rmse_exp_sm_d belongs there instead.
rmse_list <- c(rmse_ols_m, rmse_ols_w, rmse_ols_d,
rmse_bm_m, rmse_bm_w, rmse_bm_d,
rmse_ggm1, rmse_ggm_w, rmse_ggm_d,
rmse_hw2,
rmse_auto_arima, rmse_auto_arima_w, rmse_auto_arima_d,
rmse_sarima_d,
rmse_sarimax_d,
rmse_exp_sm_m, rmse_exp_sm_w, rmse_exp_sm_d,
rmse_mixture_m, rmse_mixture_w, rmse_mixture_d,
rmse_prophet_m, rmse_prophet_w, rmse_prophet_d
)
rmse_list
[1] 13229295 4000370 1421301 18498870 4542019 1651896
[7] 11759505 3453199 1600510 13169921 15118942 3328293
[13] 1094980 1094980 1146332 16189623 3331847 16189623
[19] 20987648 4567633 1629664 16786939 3246088 1071375
# Build a model-by-frequency RMSE comparison table.
# Monthly RMSE values
rmse_monthly <- c(
"OLS" = rmse_ols_m,
"Bass_Model" = rmse_bm_m,
"GGM" = rmse_ggm1,
"Holt_Winters" = rmse_hw2,
"Arima" = rmse_auto_arima,
"Exp_Smooth" = rmse_exp_sm_m,
"GGM+SARIMA" = rmse_mixture_m,
"Prophet" = rmse_prophet_m
)
# Weekly RMSE values (Holt-Winters was not fit at weekly frequency)
rmse_weekly <- c(
"OLS" = rmse_ols_w,
"Bass_Model" = rmse_bm_w,
"GGM" = rmse_ggm_w,
"Holt_Winters" = NaN,
"Arima" = rmse_auto_arima_w,
"Exp_Smooth" = rmse_exp_sm_w,
"GGM+SARIMA" = rmse_mixture_w,
"Prophet" = rmse_prophet_w
)
# Daily RMSE values (Holt-Winters was not fit at daily frequency)
rmse_daily <- c(
"OLS" = rmse_ols_d,
"Bass_Model" = rmse_bm_d,
"GGM" = rmse_ggm_d,
"Holt_Winters" = NaN,
"Arima" = rmse_auto_arima_d,
"Exp_Smooth" = rmse_exp_sm_d,
"GGM+SARIMA" = rmse_mixture_d,
"Prophet" = rmse_prophet_d
)
# Combine RMSE values into a table in one vectorized step, indexing the
# weekly/daily vectors by the monthly names so rows stay aligned.
# (Replaces the previous rbind()-inside-a-loop, which grows the data frame
# on every iteration.)
rmse_table <- data.frame(
Model = names(rmse_monthly),
Monthly = unname(rmse_monthly),
Weekly = unname(rmse_weekly[names(rmse_monthly)]),
Daily = unname(rmse_daily[names(rmse_monthly)]),
stringsAsFactors = FALSE
)
# View the RMSE table
print(rmse_table)
NA
Best models are:
# Out-of-sample evaluation: load the held-out test sales and aggregate to
# monthly and weekly frequency, then extend the monthly GGM forward.
# target variable
test_sales_df <- read_excel("data/sales/test_data.xlsx")
head(test_sales_df)
df_sales_m_test <- test_sales_df %>%
mutate(month = floor_date(date, "month")) %>% # Extract month
group_by(month) %>%
summarise(sales_m = sum(sales_cop), bar_m = sum(bar), food_m = sum(food)
) # Summing values
head(df_sales_m_test)
## sales weekly
df_sales_w_test <- test_sales_df %>%
mutate(week = floor_date(date, "week")) %>% # Extract week
group_by(week) %>%
summarise(sales_w = sum(sales_cop), bar_w = sum(bar), food_w = sum(food)) # Summing values
head(df_sales_w_test)
# Length of historical data
N <- length(sales_m_ts)
# Time index for the next 38 periods
future_times <- matrix((N + 1):(N + 38), ncol = 1)
# Predict cumulative sales for the next 38 periods
ggm_future_cum <- predict(ggm1, newx = future_times)
# Convert cumulative sales forecast to instantaneous sales
# ([-1] drops the first differenced value, leaving 37 forecast periods)
ggm_future_inst <- make.instantaneous(ggm_future_cum)[-1]
# Get the end time of sales_m_ts
end_time <- tsp(sales_m_ts)[2] # End time of original series
# Start the forecast at the next period
forecast_start <- end_time + 1 / frequency(sales_m_ts)
# Create a time series object for the forecast
ggm_future_inst_ts <- ts(
ggm_future_inst,
start = forecast_start,
frequency = frequency(sales_m_ts)
)
# Step 5: Plot the forecast with aligned index
plot(
sales_m_ts,
type = "o",
col = "black",
lwd = 2,
main = "Original, Fitted, and Forecasted Instantaneous Sales",
xlab = "Time",
ylab = "Monthly Sales",
xlim = c(start(sales_m_ts)[1], forecast_start + 38 / frequency(sales_m_ts))
)
# In-sample GGM fit (pred.inst_ggm_m is computed earlier in the script)
lines(
pred.inst_ggm_m,
col = "red",
lty = 2,
lwd = 2
)
lines(
ggm_future_inst_ts,
col = "blue",
lty = 2,
lwd = 2
)
legend(
"bottomright",
legend = c("Original Sales", "Fitted Values", "Forecast"),
col = c("black", "red", "blue"),
lty = c(1, 2, 2),
lwd = c(2, 2, 2)
)
The GGM forecast suggests the restaurant is losing momentum; in practice, restaurants can do well for far longer than the period estimated by the GGM. So we take this as a downside-scenario forecast. But let's also forecast a baseline scenario with the second-best model: the Holt-Winters.
# Step 1: Re-Fit the Holt-Winters model (multiplicative seasonality)
hw2_m <- hw(sales_m_ts, seasonal = "multiplicative")
# Step 2: Generate the forecast for the next 24 months
forecast_hw <- forecast(hw2_m, h = 24)
# Step 3: Use autoplot to plot the original values and the forecast
autoplot(forecast_hw) +
ggtitle("Holt-Winters Forecast and Original Sales") +
xlab("Time") +
ylab("Monthly Sales") +
theme_minimal()
# Compare actual vs forecasted on the first two held-out test months
actual_sales_m <- df_sales_m_test$sales_m
forecasted_hw <- forecast_hw$mean[1:2]
forecasted_ggm <- ggm_future_inst_ts[1:2]
rmse_test_hw <-calculate_rmse(actual_sales_m, forecasted_hw)
rmse_test_ggm <- calculate_rmse(actual_sales_m, forecasted_ggm)
cat("Error Holt-Winters:", rmse_test_hw, "\n")
Error Holt-Winters: 33822273
cat("Error GGM:", rmse_test_ggm, "\n")
Error GGM: 17357973
The GGM error is lower, we can do a combination (smoothing) of both scenarios for the final forecast
# Combine the actual and forecasted data into a data frame
plot_data <- data.frame(
Period = c("Last Period 1", "Last Period 2"),
Actual = actual_sales_m,
HoltWinters = forecasted_hw,
GGM = forecasted_ggm
)
# BUG FIX: melt() belongs to reshape2, which is never loaded anywhere in
# this script; use tidyr::pivot_longer() instead (tidyr is attached at the
# top of the script). Same long format: one row per Period x series.
plot_data_melt <- pivot_longer(plot_data, cols = -Period,
names_to = "Type", values_to = "Sales")
# Create a grouped bar plot of actual vs the two forecasts
ggplot(plot_data_melt, aes(x = Period, y = Sales, fill = Type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Actual vs Forecasted Sales",
x = "Period",
y = "Sales",
fill = "Type") +
theme_minimal()
# Filter the weekly Prophet forecast for November and December 2024
# (through the first week of January 2025)
forecast_nov_dec_w <- forecast_sales_w %>%
filter(ds >= as.Date("2024-11-01") & ds <= as.Date("2025-01-05"))
# Plot the actual sales with black dots
plot(as.Date(df_sales_w_test$week), df_sales_w_test$sales_w,
type = "p", # Points for actual sales
pch = 16, # Solid circles
col = "black", # Black color for points
xlab = "Date", # X-axis label
ylab = "Sales", # Y-axis label
main = "Actual Sales vs Forecasted Sales (Nov-Dec 2024)") # Title
# Add the forecast line in red
lines(as.Date(forecast_nov_dec_w$ds), forecast_nov_dec_w$yhat,
col = "red", # Red color for the line
lwd = 2) # Wider line
# Add a legend in the top right
legend("topright",
legend = c("Actual Sales", "Forecasted Sales"),
col = c("black", "red"), # Colors matching the plot
pch = c(16, NA), # Solid dot for actual sales, none for line
lty = c(NA, 1), # Line for forecasted sales, none for dot
lwd = c(NA, 2)) # Line width for the forecast
# Filter the daily Prophet forecast for November and December 2024
# (through 5 January 2025)
forecast_nov_dec_d <- forecast_sales_d %>%
filter(ds >= as.Date("2024-11-01") & ds <= as.Date("2025-01-05"))
Warning: There were 2 warnings in `filter()`.
The first warning was:
ℹ In argument: `ds >= as.Date("2024-11-01") & ds <=
as.Date("2025-01-05")`.
Caused by warning in `check_tzones()`:
! 'tzone' attributes are inconsistent
ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 1 remaining
warning.
# Plot the actual sales with black dots
plot(as.Date(test_sales_df$date), test_sales_df$sales_cop,
type = "p", # Points for actual sales
pch = 16, # Solid circles
col = "black", # Black color for points
xlab = "Date", # X-axis label
ylab = "Sales", # Y-axis label
main = "Actual Sales vs Forecasted Sales (Nov-Dec 2024)") # Title
# Add the forecast line in red
lines(as.Date(forecast_nov_dec_d$ds), forecast_nov_dec_d$yhat,
col = "red", # Red color for the line
lwd = 2) # Wider line
# Add a legend in the top right
legend("topright",
legend = c("Actual Sales", "Forecasted Sales"),
col = c("black", "red"), # Colors matching the plot
pch = c(16, NA), # Solid dot for actual sales, none for line
lty = c(NA, 1), # Line for forecasted sales, none for dot
lwd = c(NA, 2)) # Line width for the forecast
Finally, we want a long-term forecast, so we use the monthly combination of models for that. ### Monthly
# Final long-term forecast: average the Holt-Winters (baseline) and GGM
# (downside) monthly forecasts over the dates where both are available.
long_term_forecasted_hw <- forecast_hw$mean
long_term_forecasted_ggm <- ggm_future_inst_ts
# Define the date ranges for both series
# (24 months for Holt-Winters, 37 months for the GGM extension)
dates_hw <- seq(as.Date("2024-11-01"), as.Date("2026-10-01"), by = "month")
dates_ggm <- seq(as.Date("2024-11-01"), as.Date("2027-11-01"), by = "month")
# Create data frames for the two series
df_hw <- data.frame(Date = dates_hw, Value = long_term_forecasted_hw)
df_ggm <- data.frame(Date = dates_ggm, Value = long_term_forecasted_ggm)
# Merge the two series on the common dates
# (inner join: the combined forecast only spans the shorter HW horizon)
df_merged <- merge(df_hw, df_ggm, by = "Date", suffixes = c("_hw", "_ggm"))
# Calculate the mean of both series where data is available
df_merged$Mean <- rowMeans(df_merged[, c("Value_hw", "Value_ggm")], na.rm = TRUE)
# Create the new time series
long_term_forecast <- data.frame(Date = df_merged$Date, Mean = df_merged$Mean)
plot(long_term_forecast)